1use std::collections::HashMap;
9use std::sync::Arc;
10use std::sync::Mutex as StdMutex;
11use std::sync::atomic::{AtomicBool, Ordering};
12use std::time::{Duration, Instant};
13
14use async_trait::async_trait;
15use parking_lot::Mutex as SyncMutex;
16use thiserror::Error;
17use tokio::sync::{Mutex, RwLock, Semaphore, mpsc};
18use tokio::task::{JoinHandle, JoinSet};
19
20use awaken_contract::contract::event::AgentEvent;
21use awaken_contract::contract::event_sink::EventSink;
22use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
23use awaken_contract::contract::mailbox::{
24 DispatchSignalEntry, LiveDeliveryOutcome, LiveRunCommand, LiveRunTarget, MailboxInterrupt,
25 MailboxInterruptDetails, MailboxStore, RunDispatch, RunDispatchResult, RunDispatchStatus,
26};
27use awaken_contract::contract::message::Message;
28use awaken_contract::contract::storage::{
29 MessageSeqRange, RunMessageInput, RunRecord, RunRequestSnapshot, RunResumeDecision,
30 StorageError, ThreadRunStore,
31};
32use awaken_contract::contract::suspension::{ToolCallOutcome, ToolCallResume};
33use awaken_contract::contract::tool_intercept::{AdapterKind, RunMode};
34use awaken_contract::now_ms;
35use awaken_runtime::loop_runner::{AgentLoopError, AgentRunResult};
36use awaken_runtime::{AgentRuntime, RunRequest, ThreadContextSnapshot};
37
38use crate::transport::channel_sink::ReconnectableEventSink;
39
/// Milliseconds added to `now_ms()` to compute the `available_at` of the dispatch that
/// `Mailbox::submit` enqueues immediately before claiming it inline.
// NOTE(review): presumably this keeps other consumers from picking the dispatch up before
// the inline claim lands — confirm against the store's claim semantics.
const INLINE_CLAIM_GUARD_MS: u64 = 60_000;
/// Total time, in milliseconds, that `wait_for_dispatch_not_claimed` keeps polling.
#[cfg(not(test))]
const REMOTE_CANCEL_WAIT_MS: u64 = 5_000;
/// Shorter polling budget used when the crate is compiled for tests.
#[cfg(test)]
const REMOTE_CANCEL_WAIT_MS: u64 = 250;
/// Sleep, in milliseconds, between two polls in `wait_for_dispatch_not_claimed`.
const REMOTE_CANCEL_POLL_MS: u64 = 25;
// Defaults and environment-variable names for the dispatch signal loop. None of these is
// referenced in the part of the file reviewed here.
// NOTE(review): meanings below are inferred from the names only — confirm at the use sites.
const DISPATCH_SIGNAL_BATCH_DEFAULT: usize = 32;
const DISPATCH_SIGNAL_EXPIRES_DEFAULT: Duration = Duration::from_millis(500);
const DISPATCH_SIGNAL_ERROR_DELAY: Duration = Duration::from_millis(250);
const DISPATCH_SIGNAL_BLOCKED_NACK_BASE_DELAY_DEFAULT: Duration = Duration::from_millis(500);
const DISPATCH_SIGNAL_BLOCKED_NACK_MAX_DELAY_DEFAULT: Duration = Duration::from_secs(30);
const DISPATCH_SIGNAL_BATCH_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_BATCH_SIZE";
const DISPATCH_SIGNAL_EXPIRES_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_FETCH_EXPIRES_MS";
const DISPATCH_SIGNAL_NACK_BASE_DELAY_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_NACK_BASE_DELAY_MS";
const DISPATCH_SIGNAL_NACK_MAX_DELAY_ENV: &str = "AWAKEN_DISPATCH_SIGNAL_NACK_MAX_DELAY_MS";
const DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_DEFAULT: usize = 32;
const DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_ENV: &str =
    "AWAKEN_DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS";
// NOTE(review): not referenced in the reviewed part of the file; presumably a page size for
// terminal-dispatch reconciliation — confirm.
const TERMINAL_RECONCILE_BATCH: usize = 100;
/// Dispatch statuses whose counts `refresh_dispatch_depth_metrics` publishes as gauges.
const MAILBOX_DEPTH_STATUSES: [RunDispatchStatus; 6] = [
    RunDispatchStatus::Queued,
    RunDispatchStatus::Claimed,
    RunDispatchStatus::Acked,
    RunDispatchStatus::Cancelled,
    RunDispatchStatus::Superseded,
    RunDispatchStatus::DeadLetter,
];

/// Text of the `MailboxError::Validation` error that `Mailbox::submit` returns when the
/// thread's active run could not be cancelled or the inline claim came back empty.
pub(crate) const ACTIVE_RUN_CONFLICT_MESSAGE: &str =
    "thread has an active run; cannot claim inline";
73
/// Serializable bundle of the `RunRequest` fields that `from_request` copies out and
/// `apply_to` writes back onto a request.
///
/// Every field has a serde default, and the optional/collection fields are omitted from the
/// serialized form when they are `None`/empty.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
struct RunRequestExtras {
    /// Copy of `RunRequest::overrides`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    overrides: Option<awaken_contract::contract::inference::InferenceOverride>,
    /// Copy of `RunRequest::decisions`: `(String, ToolCallResume)` pairs.
    // NOTE(review): the string is presumably the tool call id — confirm.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    decisions: Vec<(
        String,
        awaken_contract::contract::suspension::ToolCallResume,
    )>,
    /// Copy of `RunRequest::frontend_tools`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    frontend_tools: Vec<awaken_contract::contract::tool::ToolDescriptor>,
    /// Copy of `RunRequest::continue_run_id`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    continue_run_id: Option<String>,
    /// Copy of `RunRequest::run_id_hint`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    run_id_hint: Option<String>,
    /// Copy of `RunRequest::dispatch_id_hint`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    dispatch_id_hint: Option<String>,
    /// Copy of `RunRequest::parent_thread_id`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    parent_thread_id: Option<String>,
    /// Copy of `RunRequest::transport_request_id`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    transport_request_id: Option<String>,
    /// Copy of `RunRequest::run_mode`; falls back to `RunMode::default()` when absent.
    #[serde(default)]
    run_mode: RunMode,
    /// Copy of `RunRequest::adapter`; falls back to `AdapterKind::default()` when absent.
    #[serde(default)]
    adapter: AdapterKind,
}
104
105impl RunRequestExtras {
106 fn from_request(request: &awaken_runtime::RunRequest) -> Self {
107 Self {
108 overrides: request.overrides.clone(),
109 decisions: request.decisions.clone(),
110 frontend_tools: request.frontend_tools.clone(),
111 continue_run_id: request.continue_run_id.clone(),
112 run_id_hint: request.run_id_hint.clone(),
113 dispatch_id_hint: request.dispatch_id_hint.clone(),
114 parent_thread_id: request.parent_thread_id.clone(),
115 transport_request_id: request.transport_request_id.clone(),
116 run_mode: request.run_mode,
117 adapter: request.adapter,
118 }
119 }
120
121 fn to_value(&self) -> Result<Option<serde_json::Value>, serde_json::Error> {
122 if self.overrides.is_none()
123 && self.decisions.is_empty()
124 && self.frontend_tools.is_empty()
125 && self.continue_run_id.is_none()
126 && self.run_id_hint.is_none()
127 && self.dispatch_id_hint.is_none()
128 && self.parent_thread_id.is_none()
129 && self.transport_request_id.is_none()
130 && self.run_mode == RunMode::Foreground
131 && self.adapter == AdapterKind::Internal
132 {
133 Ok(None)
134 } else {
135 serde_json::to_value(self).map(Some)
136 }
137 }
138
139 fn from_value(value: &serde_json::Value) -> Result<Self, serde_json::Error> {
140 serde_json::from_value(value.clone())
141 }
142
143 fn apply_to(self, mut request: awaken_runtime::RunRequest) -> awaken_runtime::RunRequest {
144 if let Some(ov) = self.overrides {
145 request = request.with_overrides(ov);
146 }
147 if !self.decisions.is_empty() {
148 request = request.with_decisions(self.decisions);
149 }
150 if !self.frontend_tools.is_empty() {
151 request = request.with_frontend_tools(self.frontend_tools);
152 }
153 if let Some(crid) = self.continue_run_id {
154 request = request.with_continue_run_id(crid);
155 }
156 if let Some(run_id_hint) = self.run_id_hint {
157 request = request.with_run_id_hint(run_id_hint);
158 }
159 if let Some(dispatch_id_hint) = self.dispatch_id_hint {
160 request = request.with_dispatch_id_hint(dispatch_id_hint);
161 }
162 if let Some(parent_thread_id) = self.parent_thread_id {
163 request = request.with_parent_thread_id(parent_thread_id);
164 }
165 if let Some(transport_request_id) = self.transport_request_id {
166 request = request.with_transport_request_id(transport_request_id);
167 }
168 request
169 .with_run_mode(self.run_mode)
170 .with_adapter(self.adapter)
171 }
172}
173
174pub struct TaskDoneMailboxNotify {
182 mailbox: Arc<Mailbox>,
183 thread_id: String,
184 continue_run_id: Option<String>,
185}
186
187impl TaskDoneMailboxNotify {
188 pub fn new(mailbox: Arc<Mailbox>, thread_id: String, continue_run_id: Option<String>) -> Self {
189 Self {
190 mailbox,
191 thread_id,
192 continue_run_id,
193 }
194 }
195}
196
197impl awaken_runtime::inbox::OnInboxClosed for TaskDoneMailboxNotify {
198 fn closed(&self, message: &serde_json::Value) {
199 let mailbox = self.mailbox.clone();
200 let thread_id = self.thread_id.clone();
201 let continue_run_id = self.continue_run_id.clone();
202 let wake_message = awaken_runtime::inbox::inbox_event_message(message);
203
204 tokio::spawn(async move {
206 let mut request = RunRequest::new(thread_id.clone(), vec![wake_message])
207 .with_origin(awaken_contract::contract::storage::RunRequestOrigin::Internal)
208 .with_run_mode(RunMode::InternalWake)
209 .with_adapter(AdapterKind::Internal);
210 if let Some(run_id) = continue_run_id {
211 request = request.with_continue_run_id(run_id);
212 }
213 if let Err(e) = mailbox.submit_background(request).await {
214 tracing::warn!(thread_id, error = %e, "failed to enqueue background task wake dispatch");
215 }
216 });
217 }
218}
219
/// Identifiers and initial status returned by the `Mailbox` submit methods.
#[derive(Debug, Clone)]
pub struct MailboxSubmitResult {
    /// Id of the dispatch that was enqueued for this submission.
    pub dispatch_id: String,
    /// Id of the run prepared for this submission.
    pub run_id: String,
    /// Thread the dispatch belongs to (taken from the built dispatch).
    pub thread_id: String,
    /// Whether execution started right away or the dispatch is waiting in the queue.
    pub status: MailboxDispatchStatus,
}
230
/// Status reported to the submitter in [`MailboxSubmitResult`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MailboxDispatchStatus {
    /// The dispatch was claimed and execution was started during the submit call.
    Running,
    /// The dispatch was enqueued but execution did not start during the submit call.
    Queued,
}
239
/// Errors returned by the mailbox API.
#[derive(Debug, Error)]
pub enum MailboxError {
    /// The request was rejected; also used for the active-run conflict reported by
    /// `Mailbox::submit`.
    #[error("validation error: {0}")]
    Validation(String),
    /// A storage operation failed; converted automatically from `StorageError` by `?`.
    #[error("store error: {0}")]
    Store(#[from] StorageError),
    /// An internal failure, e.g. a lifecycle task that panicked or failed to join.
    #[error("internal error: {0}")]
    Internal(String),
}
250
/// Result classification of a mailbox-driven run.
///
/// The error variants carry a human-readable description of the failure.
#[derive(Debug)]
pub enum MailboxRunOutcome {
    /// The run finished without an error.
    Completed,
    /// The run failed with an error classified as transient.
    TransientError(String),
    /// The run failed with an error classified as permanent.
    PermanentError(String),
}

impl MailboxRunOutcome {
    /// Returns the fixed label used for this outcome in metrics.
    fn metric_label(&self) -> &'static str {
        let label = match *self {
            MailboxRunOutcome::Completed => "completed",
            MailboxRunOutcome::TransientError(..) => "transient_error",
            MailboxRunOutcome::PermanentError(..) => "permanent_error",
        };
        label
    }
}
271
/// Abstraction over the component that executes runs for the mailbox and accepts live
/// control commands (cancel, decisions, extra messages) for them.
///
/// `AgentRuntime` implements this trait by delegation; other implementations can be
/// plugged into `Mailbox::new` / `Mailbox::new_with_executor`.
#[async_trait]
pub trait RunDispatchExecutor: Send + Sync {
    /// Executes `request`, emitting events to `sink`, and returns the run result.
    async fn run(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
    ) -> Result<AgentRunResult, AgentLoopError>;

    /// Like [`RunDispatchExecutor::run`], with an optional pre-loaded thread context.
    ///
    /// The default implementation ignores `thread_ctx` and calls `run`.
    async fn run_with_thread_context(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
        thread_ctx: Option<ThreadContextSnapshot>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        // Deliberately unused in the default implementation.
        let _ = thread_ctx;
        self.run(request, sink).await
    }

    /// Requests cancellation of the run identified by `id`.
    ///
    /// Callers in this file treat `true` as "handled locally" and stop looking elsewhere.
    // NOTE(review): `id` is passed both dispatch/run ids and thread ids by callers in this
    // file — confirm which identifiers implementations are expected to accept.
    fn cancel(&self, id: &str) -> bool;

    /// Cancels the run on `thread_id` and waits for it; returns a success flag.
    // NOTE(review): callers treat `false` as "an active run could not be cancelled" —
    // confirm what implementations return when the thread has no local run.
    async fn cancel_and_wait_by_thread(&self, thread_id: &str) -> bool;

    /// Forwards a tool-call resume decision to the run identified by `id`; returns
    /// whether it was accepted.
    fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool;

    /// Forwards additional messages to the run identified by `id`.
    ///
    /// The default implementation accepts nothing and returns `false`.
    fn send_messages(&self, id: &str, messages: Vec<Message>) -> bool {
        // Deliberately unused in the default implementation.
        let _ = (id, messages);
        false
    }
}
313
/// `AgentRuntime` as a [`RunDispatchExecutor`]: every method forwards to the
/// `AgentRuntime::…` function of the same name.
// NOTE(review): the fully-qualified `AgentRuntime::name(self, …)` paths presumably resolve
// to inherent methods of `AgentRuntime`; if one were missing, the call would resolve to
// this trait method and recurse — confirm against the runtime crate before restyling.
#[async_trait]
impl RunDispatchExecutor for AgentRuntime {
    /// Forwards to `AgentRuntime::run`.
    async fn run(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        AgentRuntime::run(self, request, sink).await
    }

    /// Forwards to `AgentRuntime::run_with_thread_context`, passing `thread_ctx` through.
    async fn run_with_thread_context(
        &self,
        request: RunRequest,
        sink: Arc<dyn EventSink>,
        thread_ctx: Option<ThreadContextSnapshot>,
    ) -> Result<AgentRunResult, AgentLoopError> {
        AgentRuntime::run_with_thread_context(self, request, sink, thread_ctx).await
    }

    /// Forwards to `AgentRuntime::cancel`.
    fn cancel(&self, id: &str) -> bool {
        AgentRuntime::cancel(self, id)
    }

    /// Forwards to `AgentRuntime::cancel_and_wait_by_thread`.
    async fn cancel_and_wait_by_thread(&self, thread_id: &str) -> bool {
        AgentRuntime::cancel_and_wait_by_thread(self, thread_id).await
    }

    /// Forwards to `AgentRuntime::send_decision`.
    fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool {
        AgentRuntime::send_decision(self, id, tool_call_id, resume)
    }

    /// Forwards to `AgentRuntime::send_messages`, replacing the trait's `false` default.
    fn send_messages(&self, id: &str, messages: Vec<Message>) -> bool {
        AgentRuntime::send_messages(self, id, messages)
    }
}
349
/// Tunables for a [`Mailbox`].
// NOTE(review): only `lease_ms` is used in the part of the file reviewed here; the other
// field descriptions are inferred from the names and should be confirmed at the use sites.
#[derive(Debug, Clone)]
pub struct MailboxConfig {
    /// Lease length in milliseconds passed to the store when a dispatch is claimed.
    pub lease_ms: u64,
    /// Lease length in milliseconds, presumably applied while a run is suspended.
    pub suspended_lease_ms: u64,
    /// Presumably the period between lease renewals.
    pub lease_renewal_interval: Duration,
    /// Presumably the period between maintenance sweeps.
    pub sweep_interval: Duration,
    /// Presumably the period between garbage-collection passes.
    pub gc_interval: Duration,
    /// Presumably the age after which terminal dispatches are garbage-collected.
    pub gc_ttl: Duration,
    /// Presumably the default attempt budget of a dispatch.
    pub default_max_attempts: u32,
    /// Presumably the default retry delay of a dispatch, in milliseconds.
    pub default_retry_delay_ms: u64,
    /// Presumably the cap on the retry delay, in milliseconds.
    pub max_retry_delay_ms: u64,
}

impl Default for MailboxConfig {
    /// Returns the stock configuration: 30 s lease, 10 min suspended lease, 10 s renewal,
    /// 30 s sweep, 60 s GC with a 24 h TTL, 5 attempts, 250 ms retry delay capped at 30 s.
    fn default() -> Self {
        const SECOND_MS: u64 = 1_000;
        const DAY_SECS: u64 = 24 * 60 * 60;

        MailboxConfig {
            lease_ms: 30 * SECOND_MS,
            suspended_lease_ms: 600 * SECOND_MS,
            lease_renewal_interval: Duration::from_secs(10),
            sweep_interval: Duration::from_secs(30),
            gc_interval: Duration::from_secs(60),
            gc_ttl: Duration::from_secs(DAY_SECS),
            default_max_attempts: 5,
            default_retry_delay_ms: 250,
            max_retry_delay_ms: 30 * SECOND_MS,
        }
    }
}
389
/// Shared, thread-safe, argument-less callback stored in
/// `MailboxLifecycleConfig::maintenance_callback`.
// NOTE(review): presumably invoked by the maintenance task — confirm where it is called.
pub type MailboxMaintenanceCallback = Arc<dyn Fn() + Send + Sync + 'static>;
392
/// Settings for the startup recovery step of the mailbox lifecycle.
// NOTE(review): field meanings are inferred from the names; the recovery code is outside
// the reviewed part of the file.
#[derive(Clone)]
pub struct MailboxStartupRecoveryConfig {
    /// Presumably how many times startup recovery is attempted.
    pub max_attempts: u32,
    /// Presumably the pause between two recovery attempts.
    pub retry_delay: Duration,
}

impl Default for MailboxStartupRecoveryConfig {
    /// Returns a single attempt with a 250 ms retry delay.
    fn default() -> Self {
        let retry_delay = Duration::from_millis(250);
        MailboxStartupRecoveryConfig {
            max_attempts: 1,
            retry_delay,
        }
    }
}
411
412#[derive(Clone)]
414pub struct MailboxLifecycleConfig {
415 pub startup_delay: Duration,
417 pub startup_recovery: MailboxStartupRecoveryConfig,
419 pub maintenance_callback: Option<MailboxMaintenanceCallback>,
421}
422
423impl Default for MailboxLifecycleConfig {
424 fn default() -> Self {
425 Self {
426 startup_delay: Duration::ZERO,
427 startup_recovery: MailboxStartupRecoveryConfig::default(),
428 maintenance_callback: None,
429 }
430 }
431}
432
433#[derive(Clone)]
438pub struct MailboxLifecycleHandle {
439 tasks: Arc<StdMutex<Option<MailboxLifecycleTasks>>>,
440 transition_lock: Arc<Mutex<()>>,
441}
442
443impl MailboxLifecycleHandle {
444 pub fn abort(&self) {
446 if let Some(tasks) = self.tasks.lock().expect("lifecycle lock poisoned").take() {
447 tasks.abort();
448 }
449 }
450
451 pub async fn shutdown(&self) -> Result<(), MailboxError> {
457 let _transition_guard = self.transition_lock.lock().await;
458 let tasks = self.tasks.lock().expect("lifecycle lock poisoned").take();
459 if let Some(tasks) = tasks {
460 tasks.shutdown().await?;
461 }
462 Ok(())
463 }
464
465 pub fn is_running(&self) -> bool {
467 self.tasks
468 .lock()
469 .expect("lifecycle lock poisoned")
470 .is_some()
471 }
472}
473
/// Join handles of the mailbox's background lifecycle tasks.
struct MailboxLifecycleTasks {
    /// Startup recovery task, when one was spawned.
    recover_task: Option<JoinHandle<()>>,
    /// Dispatch signal loop task, when one was spawned.
    dispatch_signal_task: Option<JoinHandle<()>>,
    /// Maintenance task; always present.
    maintenance_task: JoinHandle<()>,
}
479
480impl MailboxLifecycleTasks {
481 fn abort(self) {
482 if let Some(task) = self.recover_task {
483 task.abort();
484 }
485 if let Some(task) = self.dispatch_signal_task {
486 task.abort();
487 }
488 self.maintenance_task.abort();
489 }
490
491 async fn shutdown(self) -> Result<(), MailboxError> {
492 if let Some(task) = self.recover_task {
493 task.abort();
494 await_lifecycle_task("mailbox startup recovery", task).await?;
495 }
496 if let Some(task) = self.dispatch_signal_task {
497 task.abort();
498 await_lifecycle_task("mailbox dispatch signal loop", task).await?;
499 }
500 self.maintenance_task.abort();
501 await_lifecycle_task("mailbox maintenance", self.maintenance_task).await
502 }
503}
504
505async fn await_lifecycle_task(name: &str, task: JoinHandle<()>) -> Result<(), MailboxError> {
506 match task.await {
507 Ok(()) => Ok(()),
508 Err(error) if error.is_cancelled() => Ok(()),
509 Err(error) if error.is_panic() => Err(MailboxError::Internal(format!("{name} panicked"))),
510 Err(error) => Err(MailboxError::Internal(format!("{name} failed: {error}"))),
511 }
512}
513
/// State of a per-thread [`MailboxWorker`].
enum MailboxWorkerStatus {
    /// No dispatch is being executed; this is the default state of a new worker.
    Idle,
    // NOTE(review): not set anywhere in the reviewed part of the file; presumably marks a
    // worker whose claim is in flight — confirm.
    Claiming,
    /// A claimed dispatch is executing; `Mailbox::submit` sets this after an inline claim.
    Running {
        /// Id of the dispatch being executed.
        dispatch_id: String,
        /// Id of the run being executed.
        run_id: String,
        /// Handle of the task returned by `spawn_lease_renewal` for this dispatch.
        lease_handle: JoinHandle<()>,
        /// Event sink of the run, shared with the execution task.
        sink: Arc<ReconnectableEventSink>,
    },
}
529
/// Result of one attempt to claim and start the next dispatch of a thread.
// NOTE(review): the variants are produced outside the reviewed part of the file; their
// descriptions are inferred from the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DispatchAttempt {
    /// A dispatch was claimed and its execution started.
    Claimed,
    /// Presumably: the worker was already busy.
    Busy,
    /// Presumably: there was no dispatch eligible to be claimed.
    NoEligible,
    /// Presumably: the attempt failed with a retryable error.
    TransientError,
}

impl DispatchAttempt {
    /// Returns `true` only for [`DispatchAttempt::Claimed`].
    fn started_execution(self) -> bool {
        self == DispatchAttempt::Claimed
    }
}
543
544struct ThreadContext {
546 messages: Vec<Message>,
547 latest_run: Option<RunRecord>,
548 run_cache: HashMap<String, RunRecord>,
549}
550
551impl ThreadContext {
552 async fn load(run_store: &dyn ThreadRunStore, thread_id: &str) -> Result<Self, MailboxError> {
553 let messages = run_store
554 .load_messages(thread_id)
555 .await?
556 .unwrap_or_default();
557 let latest_run = run_store.latest_run(thread_id).await?;
558 let mut run_cache = HashMap::new();
559 if let Some(ref run) = latest_run {
560 run_cache.insert(run.run_id.clone(), run.clone());
561 }
562 Ok(Self {
563 messages,
564 latest_run,
565 run_cache,
566 })
567 }
568
569 fn get_run(&self, run_id: &str) -> Option<&RunRecord> {
570 self.run_cache.get(run_id)
571 }
572
573 fn apply_checkpoint(&mut self, messages: &[Message], run: &RunRecord) {
574 self.messages = messages.to_vec();
575 self.latest_run = Some(run.clone());
576 self.run_cache.insert(run.run_id.clone(), run.clone());
577 }
578}
579
580struct MailboxWorker {
582 status: MailboxWorkerStatus,
583 thread_ctx: Option<ThreadContext>,
584}
585
586impl Default for MailboxWorker {
587 fn default() -> Self {
588 Self {
589 status: MailboxWorkerStatus::Idle,
590 thread_ctx: None,
591 }
592 }
593}
594
595struct SuspensionAwareSink {
601 inner: Arc<dyn EventSink>,
602 suspended: Arc<AtomicBool>,
603}
604
605#[async_trait]
606impl EventSink for SuspensionAwareSink {
607 async fn emit(&self, event: AgentEvent) {
608 if matches!(
609 &event,
610 AgentEvent::ToolCallDone {
611 outcome: ToolCallOutcome::Suspended,
612 ..
613 }
614 ) {
615 self.suspended.store(true, Ordering::Release);
616 }
617 if matches!(&event, AgentEvent::ToolCallResumed { .. }) {
619 self.suspended.store(false, Ordering::Release);
620 }
621 self.inner.emit(event).await;
622 }
623
624 async fn close(&self) {
625 self.inner.close().await;
626 }
627}
628
/// Guard that decrements the active-runs metric when it is dropped.
// NOTE(review): presumably created right after the matching increment — the construction
// site is outside the reviewed part of the file.
struct ActiveRunGuard;

impl Drop for ActiveRunGuard {
    /// Decrements the active-runs metric.
    fn drop(&mut self) {
        crate::metrics::dec_active_runs();
    }
}
637
/// Dispatch mailbox: persists run dispatches in a `MailboxStore`, claims them for this
/// consumer and hands them to a [`RunDispatchExecutor`].
pub struct Mailbox {
    /// Executor used to cancel local runs and forward decisions to them.
    // NOTE(review): presumably also what runs claimed dispatches — the execution path is
    // outside the reviewed part of the file.
    executor: Arc<dyn RunDispatchExecutor>,
    /// Store for dispatches: enqueue, claim, cancel, interrupt and live command delivery.
    store: Arc<dyn MailboxStore>,
    /// Store for run records and thread messages.
    run_store: Arc<dyn ThreadRunStore>,
    /// Consumer identity passed to the store when claiming dispatches.
    consumer_id: String,
    /// Worker state shared between tasks.
    // NOTE(review): presumably keyed by thread id — confirm in `get_or_create_worker`.
    workers: RwLock<HashMap<String, Arc<SyncMutex<MailboxWorker>>>>,
    /// Lease, retry and maintenance tunables.
    config: MailboxConfig,
    /// Slot for the background lifecycle tasks; `None` when none are held.
    lifecycle_tasks: Arc<StdMutex<Option<MailboxLifecycleTasks>>>,
    // NOTE(review): presumably serializes lifecycle start/stop transitions — confirm.
    lifecycle_start_lock: Arc<Mutex<()>>,
}
656
657impl Mailbox {
658 pub fn new<R>(
660 executor: Arc<R>,
661 store: Arc<dyn MailboxStore>,
662 run_store: Arc<dyn ThreadRunStore>,
663 consumer_id: String,
664 config: MailboxConfig,
665 ) -> Self
666 where
667 R: RunDispatchExecutor + 'static,
668 {
669 Self::new_with_executor(executor, store, run_store, consumer_id, config)
670 }
671
672 pub fn new_with_executor(
674 executor: Arc<dyn RunDispatchExecutor>,
675 store: Arc<dyn MailboxStore>,
676 run_store: Arc<dyn ThreadRunStore>,
677 consumer_id: String,
678 config: MailboxConfig,
679 ) -> Self {
680 Self {
681 executor,
682 store,
683 run_store,
684 consumer_id,
685 workers: RwLock::new(HashMap::new()),
686 config,
687 lifecycle_tasks: Arc::new(StdMutex::new(None)),
688 lifecycle_start_lock: Arc::new(Mutex::new(())),
689 }
690 }
691
692 async fn refresh_dispatch_depth_metrics(&self) {
693 for status in MAILBOX_DEPTH_STATUSES {
694 match self.store.count_dispatches_by_status(status).await {
695 Ok(count) => {
696 let depth = count as f64;
697 crate::metrics::set_mailbox_dispatch_depth(
698 dispatch_status_label(status),
699 depth,
700 );
701 if status == RunDispatchStatus::Queued {
702 crate::metrics::set_mailbox_queue_depth(depth);
703 }
704 }
705 Err(error) => {
706 tracing::debug!(
707 status = dispatch_status_label(status),
708 error = %error,
709 "mailbox dispatch depth metric unavailable"
710 );
711 return;
712 }
713 }
714 }
715 }
716
717 async fn enqueue_dispatch_with_metrics(
718 &self,
719 dispatch: &RunDispatch,
720 ) -> Result<(), StorageError> {
721 let start = Instant::now();
722 let result = self.store.enqueue(dispatch).await;
723 record_mailbox_operation_result("enqueue", result_label(&result), start);
724 if result.is_ok() {
725 self.refresh_dispatch_depth_metrics().await;
726 }
727 result
728 }
729
    /// Capacity of the `mpsc` channel that `submit` creates for the run's events.
    const EVENT_CHANNEL_CAPACITY: usize = 256;
734
    /// Submits a foreground run: interrupts whatever the thread currently has queued or
    /// running, enqueues a new dispatch, claims it inline and starts executing it.
    ///
    /// On success returns the submit result (always `MailboxDispatchStatus::Running`)
    /// together with the receiving end of the channel that carries the run's events.
    ///
    /// # Errors
    /// - Whatever `validate_run_inputs` rejects.
    /// - `MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE)` when the thread's active
    ///   run could not be cancelled, or when the inline claim returned no dispatch.
    /// - Store and preparation errors propagated with `?`.
    #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
    pub async fn submit(
        self: &Arc<Self>,
        mut request: RunRequest,
    ) -> Result<(MailboxSubmitResult, mpsc::Receiver<AgentEvent>), MailboxError> {
        normalize_mailbox_run_mode(&mut request, false);
        let (thread_id, messages) = validate_run_inputs(
            request.thread_id.clone(),
            request.messages.clone(),
            !request.decisions.is_empty(),
        )?;

        // Step 1: interrupt the thread in the store and reconcile what that returned.
        let now = now_ms();
        let interrupt_start = Instant::now();
        match self.store.interrupt_detailed(&thread_id, now).await {
            Ok(interrupt) => {
                record_mailbox_operation_result("interrupt", "ok", interrupt_start);
                crate::metrics::inc_mailbox_operation_by(
                    "supersede",
                    "ok",
                    interrupt.superseded_count as u64,
                );
                self.refresh_dispatch_depth_metrics().await;
                // Mark the runs of the superseded dispatches as cancelled.
                for superseded_dispatch in &interrupt.superseded_dispatches {
                    self.mark_superseded_dispatch_run_cancelled(
                        superseded_dispatch,
                        "queued dispatch superseded by foreground submit",
                    )
                    .await;
                }
                if let Some(active_dispatch) = interrupt.active_dispatch.as_ref() {
                    // Cancel the active run and wait for its dispatch to be released;
                    // give up on the submission if that does not happen.
                    let cancelled = self
                        .cancel_active_dispatch(&thread_id, active_dispatch, true)
                        .await?;
                    if !cancelled {
                        return Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()));
                    }
                    tracing::info!(
                        thread_id = %thread_id,
                        superseded = interrupt.superseded_count,
                        "interrupted thread for new submission"
                    );
                }
            }
            Err(e) => {
                // The store interrupt failed: fall back to cancelling through the executor.
                record_mailbox_operation_result("interrupt", "error", interrupt_start);
                tracing::warn!(thread_id = %thread_id, error = %e, "interrupt failed, falling back to cancel");
                if !self.executor.cancel_and_wait_by_thread(&thread_id).await {
                    return Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()));
                }
            }
        }

        // Step 2: prepare the run and build its dispatch.
        let run_id = self
            .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
            .await?;
        let dispatch = self.build_dispatch(&request, &thread_id)?;
        let dispatch_id = dispatch.dispatch_id.clone();
        let thread_id = dispatch.thread_id.clone();

        // Step 3: persist the dispatch with `available_at` pushed into the future by
        // INLINE_CLAIM_GUARD_MS, then claim it by id right away.
        let mut wal_dispatch = dispatch;
        wal_dispatch.available_at = now_ms() + INLINE_CLAIM_GUARD_MS;
        self.enqueue_dispatch_with_metrics(&wal_dispatch).await?;

        let now = now_ms();
        let claim_start = Instant::now();
        let claimed_result = self
            .store
            .claim_dispatch(&dispatch_id, &self.consumer_id, self.config.lease_ms, now)
            .await;
        let claim_result_label = match &claimed_result {
            Ok(Some(_)) => "ok",
            Ok(None) => "empty",
            Err(_) => "error",
        };
        record_mailbox_operation_result("claim_dispatch", claim_result_label, claim_start);
        let claimed = claimed_result?;
        self.refresh_dispatch_depth_metrics().await;

        let (event_tx, event_rx) = mpsc::channel(Self::EVENT_CHANNEL_CAPACITY);

        if let Some(claimed_dispatch) = claimed {
            // A missing claim token falls back to the empty string.
            let claim_token = claimed_dispatch.claim_token.clone().unwrap_or_default();

            // Shared with both the lease renewal task and the execution task.
            let suspended = Arc::new(AtomicBool::new(false));

            let lease_handle = self.spawn_lease_renewal(
                dispatch_id.clone(),
                claim_token.clone(),
                thread_id.clone(),
                Arc::clone(&suspended),
            );

            let reconnectable_sink = Arc::new(ReconnectableEventSink::new(event_tx.clone()));

            // Pre-warming the thread context is best effort; a failure is only logged.
            let thread_ctx = match ThreadContext::load(self.run_store.as_ref(), &thread_id).await {
                Ok(ctx) => Some(ctx),
                Err(e) => {
                    tracing::warn!(thread_id, error = %e, "failed to pre-warm thread context");
                    None
                }
            };

            let worker = self.get_or_create_worker(&thread_id).await;
            // Scoped so the worker lock guard is dropped before execution is spawned.
            {
                let mut w = worker.lock();
                w.thread_ctx = thread_ctx;
                w.status = MailboxWorkerStatus::Running {
                    dispatch_id: dispatch_id.clone(),
                    run_id: run_id.clone(),
                    lease_handle,
                    sink: Arc::clone(&reconnectable_sink),
                };
            }

            self.spawn_execution(
                claimed_dispatch,
                event_tx.clone(),
                reconnectable_sink,
                claim_token,
                thread_id.clone(),
                suspended,
            );

            Ok((
                MailboxSubmitResult {
                    dispatch_id,
                    run_id,
                    thread_id,
                    status: MailboxDispatchStatus::Running,
                },
                event_rx,
            ))
        } else {
            // The inline claim returned nothing: try to cancel the dispatch that was just
            // enqueued, then report the conflict to the caller.
            let now_fix = now_ms();
            let cancel_start = Instant::now();
            let cancel_result = self.store.cancel(&dispatch_id, now_fix).await;
            record_mailbox_operation_result("cancel", result_label(&cancel_result), cancel_start);
            match cancel_result {
                Ok(Some(cancelled_dispatch)) => {
                    self.mark_cancelled_dispatch_run_cancelled(
                        &cancelled_dispatch,
                        "inline dispatch cancelled after claim race",
                    )
                    .await;
                    self.refresh_dispatch_depth_metrics().await;
                }
                Ok(None) => {
                    // Nothing was cancelled; reconcile whatever state the dispatch is in
                    // now, if it can still be loaded.
                    if let Ok(Some(dispatch)) = self.store.load_dispatch(&dispatch_id).await {
                        self.reconcile_terminal_dispatch(&dispatch).await;
                    }
                    self.refresh_dispatch_depth_metrics().await;
                }
                Err(e) => {
                    tracing::warn!(dispatch_id, error = %e, "failed to cancel unclaimed inline dispatch");
                }
            }
            Err(MailboxError::Validation(ACTIVE_RUN_CONFLICT_MESSAGE.into()))
        }
    }
915
916 #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
921 pub async fn submit_background(
922 self: &Arc<Self>,
923 mut request: RunRequest,
924 ) -> Result<MailboxSubmitResult, MailboxError> {
925 normalize_mailbox_run_mode(&mut request, true);
926 let (thread_id, messages) = validate_run_inputs(
927 request.thread_id.clone(),
928 request.messages.clone(),
929 !request.decisions.is_empty(),
930 )?;
931
932 let run_id = self
933 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
934 .await?;
935 let dispatch = self.build_dispatch(&request, &thread_id)?;
936 let dispatch_id = dispatch.dispatch_id.clone();
937 let thread_id = dispatch.thread_id.clone();
938
939 self.enqueue_dispatch_with_metrics(&dispatch).await?;
941
942 self.get_or_create_worker(&thread_id).await;
944 let claimed = self.try_dispatch_next(&thread_id).await;
945 let status = if claimed.started_execution() {
946 MailboxDispatchStatus::Running
947 } else {
948 MailboxDispatchStatus::Queued
949 };
950
951 Ok(MailboxSubmitResult {
952 dispatch_id,
953 run_id,
954 thread_id,
955 status,
956 })
957 }
958
959 #[tracing::instrument(skip(self, request), fields(thread_id = %request.thread_id))]
985 pub async fn submit_live_then_queue(
986 self: &Arc<Self>,
987 mut request: RunRequest,
988 expected_run_id: Option<&str>,
989 ) -> Result<MailboxSubmitResult, MailboxError> {
990 let (thread_id, messages) = validate_run_inputs(
991 request.thread_id.clone(),
992 request.messages.clone(),
993 !request.decisions.is_empty(),
994 )?;
995 let messages = normalize_message_ids(&messages);
996
997 if let Some(result) = self
998 .try_deliver_live_messages(&thread_id, expected_run_id, messages.clone())
999 .await?
1000 {
1001 return Ok(result);
1002 }
1003
1004 request.thread_id = thread_id;
1005 request.messages = messages;
1006 self.submit_background(request).await
1007 }
1008
1009 pub async fn cancel(&self, id: &str) -> Result<bool, MailboxError> {
1016 let now = now_ms();
1018 let cancel_start = Instant::now();
1019 let cancel_result = self.store.cancel(id, now).await;
1020 record_mailbox_operation_result("cancel", result_label(&cancel_result), cancel_start);
1021 let cancelled = cancel_result?;
1022 if let Some(cancelled_dispatch) = cancelled {
1023 self.mark_cancelled_dispatch_run_cancelled(
1024 &cancelled_dispatch,
1025 "queued dispatch cancelled",
1026 )
1027 .await;
1028 self.refresh_dispatch_depth_metrics().await;
1029 return Ok(true);
1030 }
1031
1032 if self.executor.cancel(id) {
1034 return Ok(true);
1035 }
1036
1037 if let Some(dispatch) = self.store.load_dispatch(id).await?
1038 && dispatch.status == RunDispatchStatus::Claimed
1039 {
1040 return self
1041 .deliver_live_cancel(&live_target_for_dispatch(&dispatch))
1042 .await;
1043 }
1044
1045 let run = if let Some(run) = self.run_store.load_run(id).await? {
1046 Some(run)
1047 } else {
1048 self.run_store.latest_run(id).await?
1049 };
1050 if let Some(run) = run
1051 && matches!(run.status, RunStatus::Running | RunStatus::Waiting)
1052 {
1053 return self.deliver_live_cancel(&live_target_for_run(&run)).await;
1054 }
1055
1056 Ok(false)
1057 }
1058
1059 pub async fn interrupt(&self, thread_id: &str) -> Result<MailboxInterrupt, MailboxError> {
1062 self.interrupt_detailed(thread_id).await.map(Into::into)
1063 }
1064
1065 pub async fn interrupt_detailed(
1068 &self,
1069 thread_id: &str,
1070 ) -> Result<MailboxInterruptDetails, MailboxError> {
1071 let now = now_ms();
1072 let interrupt_start = Instant::now();
1073 let interrupt_result = self.store.interrupt_detailed(thread_id, now).await;
1074 record_mailbox_operation_result(
1075 "interrupt",
1076 result_label(&interrupt_result),
1077 interrupt_start,
1078 );
1079 let result = interrupt_result?;
1080 crate::metrics::inc_mailbox_operation_by("supersede", "ok", result.superseded_count as u64);
1081 self.refresh_dispatch_depth_metrics().await;
1082 for superseded_dispatch in &result.superseded_dispatches {
1083 self.mark_superseded_dispatch_run_cancelled(
1084 superseded_dispatch,
1085 "queued dispatch superseded by interrupt",
1086 )
1087 .await;
1088 }
1089
1090 if let Some(active_dispatch) = result.active_dispatch.as_ref() {
1092 self.cancel_active_dispatch(thread_id, active_dispatch, false)
1093 .await?;
1094 }
1095
1096 Ok(result)
1097 }
1098
    /// Forwards a tool-call resume decision to the local executor only.
    ///
    /// Returns the executor's answer; see [`Mailbox::send_decision_live`] for the variant
    /// that also tries live delivery through the store.
    pub fn send_decision(&self, id: &str, tool_call_id: String, resume: ToolCallResume) -> bool {
        self.executor.send_decision(id, tool_call_id, resume)
    }
1106
1107 pub async fn send_decision_live(
1113 &self,
1114 id: &str,
1115 tool_call_id: String,
1116 resume: ToolCallResume,
1117 ) -> Result<bool, MailboxError> {
1118 if self
1119 .executor
1120 .send_decision(id, tool_call_id.clone(), resume.clone())
1121 {
1122 return Ok(true);
1123 }
1124
1125 if let Some(dispatch) = self.store.load_dispatch(id).await?
1126 && dispatch.status == RunDispatchStatus::Claimed
1127 {
1128 return self
1129 .deliver_live_decision(
1130 &live_target_for_dispatch(&dispatch),
1131 vec![(tool_call_id, resume)],
1132 )
1133 .await;
1134 }
1135
1136 let run = if let Some(run) = self.run_store.load_run(id).await? {
1137 Some(run)
1138 } else {
1139 self.run_store.latest_run(id).await?
1140 };
1141 if let Some(run) = run
1142 && matches!(run.status, RunStatus::Running | RunStatus::Waiting)
1143 {
1144 return self
1145 .deliver_live_decision(&live_target_for_run(&run), vec![(tool_call_id, resume)])
1146 .await;
1147 }
1148
1149 Ok(false)
1150 }
1151
    /// Cancels the run behind `active_dispatch`, first through the local executor and,
    /// if that does not succeed, through a live cancel command delivered via the store.
    ///
    /// With `wait_for_release` set, success additionally requires that the dispatch
    /// stops being `Claimed` within the polling budget of `wait_for_dispatch_not_claimed`.
    ///
    /// Returns `Ok(true)` when the cancel was accepted (and, if requested, the dispatch
    /// was released), `Ok(false)` otherwise.
    ///
    /// # Errors
    /// Propagates store errors from live delivery and from polling the dispatch.
    async fn cancel_active_dispatch(
        &self,
        thread_id: &str,
        active_dispatch: &RunDispatch,
        wait_for_release: bool,
    ) -> Result<bool, MailboxError> {
        if wait_for_release {
            if self.executor.cancel_and_wait_by_thread(thread_id).await {
                // The local cancel succeeded: the outcome now only depends on whether the
                // dispatch is released in time; no live cancel is attempted.
                if self
                    .wait_for_dispatch_not_claimed(&active_dispatch.dispatch_id)
                    .await?
                {
                    return Ok(true);
                }
                tracing::warn!(
                    thread_id,
                    dispatch_id = %active_dispatch.dispatch_id,
                    "local cancel completed but active dispatch did not release before foreground submit"
                );
                return Ok(false);
            }
        } else if self.executor.cancel(thread_id) {
            return Ok(true);
        }

        // The local executor did not cancel the run: deliver a live cancel command to the
        // target derived from the dispatch.
        if !self
            .deliver_live_cancel(&live_target_for_dispatch(active_dispatch))
            .await?
        {
            return Ok(false);
        }

        if wait_for_release
            && !self
                .wait_for_dispatch_not_claimed(&active_dispatch.dispatch_id)
                .await?
        {
            tracing::warn!(
                thread_id,
                dispatch_id = %active_dispatch.dispatch_id,
                "remote cancel delivered but active dispatch did not release before foreground submit"
            );
            return Ok(false);
        }
        Ok(true)
    }
1198
1199 async fn deliver_live_cancel(&self, target: &LiveRunTarget) -> Result<bool, MailboxError> {
1200 match self
1201 .store
1202 .deliver_live_to(target, LiveRunCommand::Cancel)
1203 .await?
1204 {
1205 LiveDeliveryOutcome::Delivered => Ok(true),
1206 LiveDeliveryOutcome::NoSubscriber => Ok(false),
1207 }
1208 }
1209
1210 async fn deliver_live_decision(
1211 &self,
1212 target: &LiveRunTarget,
1213 decisions: Vec<(String, ToolCallResume)>,
1214 ) -> Result<bool, MailboxError> {
1215 match self
1216 .store
1217 .deliver_live_to(target, LiveRunCommand::Decision(decisions))
1218 .await?
1219 {
1220 LiveDeliveryOutcome::Delivered => Ok(true),
1221 LiveDeliveryOutcome::NoSubscriber => Ok(false),
1222 }
1223 }
1224
1225 async fn wait_for_dispatch_not_claimed(&self, dispatch_id: &str) -> Result<bool, MailboxError> {
1226 let deadline = tokio::time::Instant::now() + Duration::from_millis(REMOTE_CANCEL_WAIT_MS);
1227 loop {
1228 match self.store.load_dispatch(dispatch_id).await? {
1229 Some(dispatch) if dispatch.status == RunDispatchStatus::Claimed => {}
1230 _ => return Ok(true),
1231 }
1232 if tokio::time::Instant::now() >= deadline {
1233 return Ok(false);
1234 }
1235 tokio::time::sleep(Duration::from_millis(REMOTE_CANCEL_POLL_MS)).await;
1236 }
1237 }
1238
1239 async fn mark_superseded_dispatch_run_cancelled(&self, dispatch: &RunDispatch, reason: &str) {
1240 self.mark_dispatch_run_cancelled("mark_run_superseded", "superseded", dispatch, reason)
1241 .await;
1242 }
1243
1244 async fn mark_cancelled_dispatch_run_cancelled(&self, dispatch: &RunDispatch, reason: &str) {
1245 self.mark_dispatch_run_cancelled("mark_run_cancelled", "cancelled", dispatch, reason)
1246 .await;
1247 }
1248
1249 async fn mark_dispatch_run_cancelled(
1250 &self,
1251 operation: &str,
1252 outcome: &str,
1253 dispatch: &RunDispatch,
1254 reason: &str,
1255 ) {
1256 let start = Instant::now();
1257 let result = self
1258 .mark_dispatch_run_cancelled_inner(dispatch, reason)
1259 .await;
1260 record_mailbox_operation_result(operation, result_label(&result), start);
1261 if matches!(result, Ok(true)) {
1262 record_mailbox_dispatch_terminal_metrics(dispatch, outcome);
1263 }
1264 if let Err(error) = result {
1265 tracing::warn!(
1266 dispatch_id = %dispatch.dispatch_id,
1267 run_id = %dispatch.run_id,
1268 thread_id = %dispatch.thread_id,
1269 reason,
1270 error = %error,
1271 "failed to mark terminal mailbox run as cancelled"
1272 );
1273 }
1274 }
1275
1276 async fn mark_dispatch_run_cancelled_inner(
1277 &self,
1278 dispatch: &RunDispatch,
1279 _reason: &str,
1280 ) -> Result<bool, MailboxError> {
1281 let Some(mut run) = self.run_store.load_run(&dispatch.run_id).await? else {
1282 return Ok(false);
1283 };
1284 if run.thread_id != dispatch.thread_id || run.status == RunStatus::Done {
1285 return Ok(false);
1286 }
1287
1288 let now = now_ms() / 1000;
1289 run.status = RunStatus::Done;
1290 run.termination_reason = Some(TerminationReason::Cancelled);
1291 run.error_payload = None;
1292 run.dispatch_id = Some(dispatch.dispatch_id.clone());
1293 run.session_id = dispatch.dispatch_instance_id.clone();
1294 run.waiting = None;
1295 run.finished_at = Some(now);
1296 run.updated_at = now;
1297
1298 self.checkpoint_terminal_dispatch_run(dispatch, &run)
1299 .await?;
1300 Ok(true)
1301 }
1302
1303 async fn mark_dead_letter_dispatch_run_error(&self, dispatch: &RunDispatch) {
1304 let start = Instant::now();
1305 let result = self
1306 .mark_dead_letter_dispatch_run_error_inner(dispatch)
1307 .await;
1308 record_mailbox_operation_result("mark_run_dead_letter", result_label(&result), start);
1309 if matches!(result, Ok(true)) {
1310 record_mailbox_dispatch_terminal_metrics(dispatch, "dead_letter");
1311 }
1312 if let Err(error) = result {
1313 tracing::warn!(
1314 dispatch_id = %dispatch.dispatch_id,
1315 run_id = %dispatch.run_id,
1316 thread_id = %dispatch.thread_id,
1317 error = %error,
1318 "failed to mark dead-lettered mailbox run as errored"
1319 );
1320 }
1321 }
1322
1323 async fn reconcile_terminal_dispatch(&self, dispatch: &RunDispatch) {
1324 match dispatch.status {
1325 RunDispatchStatus::DeadLetter => {
1326 self.mark_dead_letter_dispatch_run_error(dispatch).await;
1327 }
1328 RunDispatchStatus::Cancelled => {
1329 self.mark_cancelled_dispatch_run_cancelled(
1330 dispatch,
1331 "cancelled dispatch reclaimed during mailbox maintenance",
1332 )
1333 .await;
1334 }
1335 RunDispatchStatus::Superseded => {
1336 self.mark_superseded_dispatch_run_cancelled(
1337 dispatch,
1338 "superseded dispatch reclaimed during mailbox maintenance",
1339 )
1340 .await;
1341 }
1342 RunDispatchStatus::Queued | RunDispatchStatus::Claimed | RunDispatchStatus::Acked => {}
1343 }
1344 }
1345
1346 async fn reconcile_terminal_dispatches(&self) {
1347 let mut offset = 0;
1348 loop {
1349 let list_start = Instant::now();
1350 let result = self
1351 .store
1352 .list_terminal_dispatches(TERMINAL_RECONCILE_BATCH, offset)
1353 .await;
1354 record_mailbox_operation_result(
1355 "list_terminal_dispatches",
1356 result_label(&result),
1357 list_start,
1358 );
1359 let dispatches = match result {
1360 Ok(dispatches) => dispatches,
1361 Err(error) => {
1362 tracing::warn!(
1363 error = %error,
1364 "failed to list terminal mailbox dispatches for run reconciliation"
1365 );
1366 return;
1367 }
1368 };
1369 if dispatches.is_empty() {
1370 return;
1371 }
1372 crate::metrics::inc_mailbox_operation_by(
1373 "reconcile_terminal_dispatch",
1374 "ok",
1375 dispatches.len() as u64,
1376 );
1377 let page_len = dispatches.len();
1378 for dispatch in &dispatches {
1379 self.reconcile_terminal_dispatch(dispatch).await;
1380 }
1381 if page_len < TERMINAL_RECONCILE_BATCH {
1382 return;
1383 }
1384 offset += page_len;
1385 }
1386 }
1387
1388 async fn mark_dead_letter_dispatch_run_error_inner(
1389 &self,
1390 dispatch: &RunDispatch,
1391 ) -> Result<bool, MailboxError> {
1392 let Some(mut run) = self.run_store.load_run(&dispatch.run_id).await? else {
1393 return Ok(false);
1394 };
1395 if run.thread_id != dispatch.thread_id || run.status == RunStatus::Done {
1396 return Ok(false);
1397 }
1398
1399 let reason = dispatch
1400 .run_error
1401 .clone()
1402 .or_else(|| dispatch.last_error.clone())
1403 .unwrap_or_else(|| "mailbox dispatch dead-lettered".to_string());
1404 let now = now_ms() / 1000;
1405 run.status = RunStatus::Done;
1406 run.termination_reason = Some(TerminationReason::Error(reason.clone()));
1407 run.error_payload = Some(serde_json::json!({ "message": reason }));
1408 run.dispatch_id = Some(dispatch.dispatch_id.clone());
1409 run.session_id = dispatch.dispatch_instance_id.clone();
1410 run.waiting = None;
1411 run.finished_at = Some(now);
1412 run.updated_at = now;
1413
1414 self.checkpoint_terminal_dispatch_run(dispatch, &run)
1415 .await?;
1416 Ok(true)
1417 }
1418
1419 async fn checkpoint_terminal_dispatch_run(
1420 &self,
1421 dispatch: &RunDispatch,
1422 run: &RunRecord,
1423 ) -> Result<(), MailboxError> {
1424 let messages = self
1425 .run_store
1426 .load_messages(&dispatch.thread_id)
1427 .await?
1428 .unwrap_or_default();
1429 self.run_store
1430 .checkpoint(&dispatch.thread_id, &messages, run)
1431 .await?;
1432 {
1433 let workers = self.workers.read().await;
1434 if let Some(worker) = workers.get(&dispatch.thread_id) {
1435 let mut worker = worker.lock();
1436 if let Some(ref mut ctx) = worker.thread_ctx {
1437 ctx.apply_checkpoint(&messages, run);
1438 }
1439 }
1440 }
1441 Ok(())
1442 }
1443
1444 async fn try_deliver_live_messages(
1445 &self,
1446 thread_id: &str,
1447 expected_run_id: Option<&str>,
1448 messages: Vec<Message>,
1449 ) -> Result<Option<MailboxSubmitResult>, MailboxError> {
1450 if messages.is_empty() {
1451 return Ok(None);
1452 }
1453
1454 let local_active = {
1455 let workers = self.workers.read().await;
1456 workers.get(thread_id).and_then(|worker| {
1457 let worker = worker.lock();
1458 match &worker.status {
1459 MailboxWorkerStatus::Running {
1460 dispatch_id,
1461 run_id,
1462 ..
1463 } => Some((dispatch_id.clone(), run_id.clone())),
1464 MailboxWorkerStatus::Idle | MailboxWorkerStatus::Claiming => None,
1465 }
1466 })
1467 };
1468
1469 if let Some((active_dispatch_id, active_run_id)) = local_active {
1470 if expected_run_id.is_some_and(|expected| expected != active_run_id) {
1472 return Ok(None);
1473 }
1474 if !self.executor.send_messages(&active_run_id, messages) {
1479 return Ok(None);
1480 }
1481 return Ok(Some(MailboxSubmitResult {
1482 dispatch_id: active_dispatch_id,
1483 run_id: active_run_id,
1484 thread_id: thread_id.to_string(),
1485 status: MailboxDispatchStatus::Running,
1486 }));
1487 }
1488
1489 let Some(remote_run) = self.run_store.latest_run(thread_id).await? else {
1493 return Ok(None);
1494 };
1495 if remote_run.status != RunStatus::Running {
1496 return Ok(None);
1497 }
1498 if expected_run_id.is_some_and(|expected| expected != remote_run.run_id) {
1499 return Ok(None);
1500 }
1501
1502 let outcome = self
1507 .store
1508 .deliver_live_to(
1509 &live_target_for_run(&remote_run),
1510 awaken_contract::contract::mailbox::LiveRunCommand::Messages(messages),
1511 )
1512 .await?;
1513 match outcome {
1514 awaken_contract::contract::mailbox::LiveDeliveryOutcome::Delivered => {}
1515 awaken_contract::contract::mailbox::LiveDeliveryOutcome::NoSubscriber => {
1516 return Ok(None);
1517 }
1518 }
1519
1520 let dispatch_id = remote_run
1521 .dispatch_id
1522 .clone()
1523 .unwrap_or_else(|| remote_run.run_id.clone());
1524 Ok(Some(MailboxSubmitResult {
1525 dispatch_id,
1526 run_id: remote_run.run_id,
1527 thread_id: thread_id.to_string(),
1528 status: MailboxDispatchStatus::Running,
1529 }))
1530 }
1531
1532 pub async fn reconnect_sink(&self, thread_id: &str, new_tx: mpsc::Sender<AgentEvent>) -> bool {
1537 let workers = self.workers.read().await;
1538 let Some(worker) = workers.get(thread_id) else {
1539 return false;
1540 };
1541 let w = worker.lock();
1542 match &w.status {
1543 MailboxWorkerStatus::Running { sink, .. } => {
1544 sink.reconnect(new_tx);
1545 true
1546 }
1547 MailboxWorkerStatus::Idle | MailboxWorkerStatus::Claiming => false,
1548 }
1549 }
1550
1551 async fn reusable_waiting_run_id(&self, thread_id: &str) -> Option<String> {
1552 if let Some(thread) = self.run_store.load_thread(thread_id).await.ok().flatten()
1553 && let Some(open_run_id) = thread.open_run_id.as_deref()
1554 && let Some(run) = self.run_store.load_run(open_run_id).await.ok().flatten()
1555 && run.thread_id == thread_id
1556 && run.is_resumable_waiting()
1557 {
1558 return Some(run.run_id);
1559 }
1560 let run = self.run_store.latest_run(thread_id).await.ok().flatten()?;
1561 run.is_resumable_waiting().then_some(run.run_id)
1562 }
1563
1564 pub async fn list_dispatches(
1568 &self,
1569 thread_id: &str,
1570 status_filter: Option<&[RunDispatchStatus]>,
1571 limit: usize,
1572 offset: usize,
1573 ) -> Result<Vec<RunDispatch>, MailboxError> {
1574 Ok(self
1575 .store
1576 .list_dispatches(thread_id, status_filter, limit, offset)
1577 .await?)
1578 }
1579
1580 pub async fn queued_thread_ids(&self) -> Result<Vec<String>, MailboxError> {
1582 Ok(self.store.queued_thread_ids().await?)
1583 }
1584
1585 pub async fn load_dispatch(
1586 &self,
1587 dispatch_id: &str,
1588 ) -> Result<Option<RunDispatch>, MailboxError> {
1589 Ok(self.store.load_dispatch(dispatch_id).await?)
1590 }
1591
1592 pub fn start_lifecycle(
1608 self: &Arc<Self>,
1609 config: MailboxLifecycleConfig,
1610 ) -> Result<MailboxLifecycleHandle, MailboxError> {
1611 let handle = MailboxLifecycleHandle {
1612 tasks: Arc::clone(&self.lifecycle_tasks),
1613 transition_lock: Arc::clone(&self.lifecycle_start_lock),
1614 };
1615 for _ in 0..16 {
1616 match self.lifecycle_start_lock.try_lock() {
1617 Ok(_transition_guard) => return self.start_lifecycle_internal(config, true),
1618 Err(_) if self.lifecycle_is_running()? => return Ok(handle),
1619 Err(_) => std::thread::yield_now(),
1620 }
1621 }
1622 Err(MailboxError::Internal(
1623 "mailbox lifecycle transition is already running".to_string(),
1624 ))
1625 }
1626
1627 pub async fn start_lifecycle_ready(
1635 self: &Arc<Self>,
1636 mut config: MailboxLifecycleConfig,
1637 ) -> Result<MailboxLifecycleHandle, MailboxError> {
1638 let _start_guard = self.lifecycle_start_lock.lock().await;
1639 let handle = MailboxLifecycleHandle {
1640 tasks: Arc::clone(&self.lifecycle_tasks),
1641 transition_lock: Arc::clone(&self.lifecycle_start_lock),
1642 };
1643 if self.lifecycle_is_running()? {
1644 return Ok(handle);
1645 }
1646
1647 if !config.startup_delay.is_zero() {
1648 tokio::time::sleep(config.startup_delay).await;
1649 config.startup_delay = Duration::ZERO;
1650 }
1651
1652 self.run_startup_recovery_with_retry(config.startup_recovery.clone())
1653 .await?;
1654 self.start_lifecycle_internal(config, false)
1655 }
1656
1657 fn lifecycle_is_running(&self) -> Result<bool, MailboxError> {
1658 Ok(self
1659 .lifecycle_tasks
1660 .lock()
1661 .map_err(|_| MailboxError::Internal("mailbox lifecycle lock poisoned".to_string()))?
1662 .is_some())
1663 }
1664
1665 fn start_lifecycle_internal(
1666 self: &Arc<Self>,
1667 config: MailboxLifecycleConfig,
1668 run_startup_recovery: bool,
1669 ) -> Result<MailboxLifecycleHandle, MailboxError> {
1670 let handle = MailboxLifecycleHandle {
1671 tasks: Arc::clone(&self.lifecycle_tasks),
1672 transition_lock: Arc::clone(&self.lifecycle_start_lock),
1673 };
1674 let mut lifecycle = self
1675 .lifecycle_tasks
1676 .lock()
1677 .map_err(|_| MailboxError::Internal("mailbox lifecycle lock poisoned".to_string()))?;
1678
1679 if lifecycle.is_some() {
1680 return Ok(handle);
1681 }
1682
1683 let startup_delay = config.startup_delay;
1684 let startup_recovery = config.startup_recovery.clone();
1685 let recover_mailbox = Arc::clone(self);
1686 let recover_task = run_startup_recovery.then(|| {
1687 tokio::spawn(async move {
1688 if !startup_delay.is_zero() {
1689 tokio::time::sleep(startup_delay).await;
1690 }
1691 match recover_mailbox
1692 .run_startup_recovery_with_retry(startup_recovery)
1693 .await
1694 {
1695 Ok(recovered) => {
1696 tracing::info!(recovered, "mailbox startup recovery completed");
1697 }
1698 Err(error) => {
1699 tracing::error!(error = %error, "mailbox startup recovery failed");
1700 }
1701 }
1702 })
1703 });
1704
1705 let maintenance_mailbox = Arc::clone(self);
1706 let maintenance_callback = config.maintenance_callback;
1707 let maintenance_task = tokio::spawn(async move {
1708 if !startup_delay.is_zero() {
1709 tokio::time::sleep(startup_delay).await;
1710 }
1711 maintenance_mailbox
1712 .run_maintenance_loop(maintenance_callback)
1713 .await;
1714 });
1715
1716 let dispatch_signal_task = self.store.supports_dispatch_signals().then(|| {
1717 let signal_mailbox = Arc::clone(self);
1718 tokio::spawn(async move {
1719 if !startup_delay.is_zero() {
1720 tokio::time::sleep(startup_delay).await;
1721 }
1722 signal_mailbox.run_dispatch_signal_loop().await;
1723 })
1724 });
1725
1726 *lifecycle = Some(MailboxLifecycleTasks {
1727 recover_task,
1728 dispatch_signal_task,
1729 maintenance_task,
1730 });
1731 Ok(handle)
1732 }
1733
1734 async fn run_startup_recovery_with_retry(
1735 self: &Arc<Self>,
1736 config: MailboxStartupRecoveryConfig,
1737 ) -> Result<usize, MailboxError> {
1738 let max_attempts = config.max_attempts.max(1);
1739 for attempt in 1..=max_attempts {
1740 match self.recover().await {
1741 Ok(recovered) => return Ok(recovered),
1742 Err(error) if attempt < max_attempts => {
1743 tracing::warn!(
1744 attempt,
1745 max_attempts,
1746 retry_delay_ms = config.retry_delay.as_millis(),
1747 error = %error,
1748 "mailbox startup recovery failed; retrying"
1749 );
1750 if !config.retry_delay.is_zero() {
1751 tokio::time::sleep(config.retry_delay).await;
1752 }
1753 }
1754 Err(error) => return Err(error),
1755 }
1756 }
1757 unreachable!("max_attempts is normalized to at least one")
1758 }
1759
1760 #[tracing::instrument(skip(self))]
1762 pub async fn recover(self: &Arc<Self>) -> Result<usize, MailboxError> {
1763 let now = now_ms();
1764 let mut total = 0;
1765
1766 let reclaim_start = Instant::now();
1768 let reclaimed_result = self.store.reclaim_expired_leases(now, 100).await;
1769 record_mailbox_operation_result("reclaim", result_label(&reclaimed_result), reclaim_start);
1770 let reclaimed = reclaimed_result?;
1771 crate::metrics::inc_mailbox_operation_by("reclaim_dispatch", "ok", reclaimed.len() as u64);
1772 if !reclaimed.is_empty() {
1773 self.refresh_dispatch_depth_metrics().await;
1774 }
1775 for dispatch in &reclaimed {
1776 self.reconcile_terminal_dispatch(dispatch).await;
1777 }
1778 self.reconcile_terminal_dispatches().await;
1779 total += reclaimed.len();
1780
1781 let thread_ids = self.store.queued_thread_ids().await?;
1783 for thread_id in &thread_ids {
1784 self.get_or_create_worker(thread_id).await;
1786 self.try_dispatch_next(thread_id).await;
1787 }
1788
1789 {
1791 let query = awaken_contract::contract::storage::RunQuery {
1792 status: Some(awaken_contract::contract::lifecycle::RunStatus::Waiting),
1793 limit: 200,
1794 ..Default::default()
1795 };
1796 if let Ok(page) = self.run_store.list_runs(&query).await {
1797 let queued_set: std::collections::HashSet<String> =
1798 thread_ids.iter().cloned().collect();
1799 for run in &page.items {
1800 if !run.is_background_task_waiting() {
1801 continue;
1802 }
1803 if queued_set.contains(&run.thread_id) {
1805 continue;
1806 }
1807 let request = RunRequest::new(
1808 run.thread_id.clone(),
1809 vec![Message::internal_user("<background-tasks-updated />")],
1810 )
1811 .with_agent_id(run.agent_id.clone())
1812 .with_continue_run_id(run.run_id.clone())
1813 .with_origin(awaken_contract::contract::storage::RunRequestOrigin::Internal)
1814 .with_run_mode(RunMode::InternalWake)
1815 .with_adapter(AdapterKind::Internal);
1816 if self.submit_background(request).await.is_ok() {
1817 total += 1;
1818 tracing::info!(
1819 thread_id = %run.thread_id,
1820 run_id = %run.run_id,
1821 "recover: enqueued wake dispatch for orphaned background-task thread"
1822 );
1823 }
1824 }
1825 }
1826 }
1827
1828 Ok(total)
1829 }
1830
1831 pub async fn run_maintenance_loop(
1836 self: Arc<Self>,
1837 maintenance_callback: Option<MailboxMaintenanceCallback>,
1838 ) {
1839 let mut sweep_interval = tokio::time::interval(self.config.sweep_interval);
1840 let mut gc_interval = tokio::time::interval(self.config.gc_interval);
1841
1842 sweep_interval.tick().await;
1844 gc_interval.tick().await;
1845
1846 loop {
1847 tokio::select! {
1848 _ = sweep_interval.tick() => {
1849 self.run_sweep().await;
1850 }
1851 _ = gc_interval.tick() => {
1852 self.run_gc().await;
1853 if let Some(cleanup) = &maintenance_callback {
1854 cleanup();
1855 }
1856 }
1857 }
1858 }
1859 }
1860
1861 pub async fn run_dispatch_signal_loop(self: Arc<Self>) {
1863 loop {
1864 let pull_start = Instant::now();
1865 let pull_result = self
1866 .store
1867 .pull_dispatch_signals(
1868 dispatch_signal_batch_size(),
1869 dispatch_signal_fetch_expires(),
1870 )
1871 .await;
1872 record_mailbox_operation_result("signal_pull", result_label(&pull_result), pull_start);
1873 match pull_result {
1874 Ok(entries) => {
1875 crate::metrics::inc_mailbox_dispatch_signal_pulled_by(entries.len() as u64);
1876 self.handle_dispatch_signal_entries(entries).await;
1877 }
1878 Err(error) => {
1879 tracing::warn!(error = %error, "dispatch signal pull failed");
1880 tokio::time::sleep(DISPATCH_SIGNAL_ERROR_DELAY).await;
1881 }
1882 }
1883 }
1884 }
1885
1886 async fn handle_dispatch_signal_entries(self: &Arc<Self>, entries: Vec<DispatchSignalEntry>) {
1887 if entries.is_empty() {
1888 return;
1889 }
1890 let max_concurrent = dispatch_signal_max_concurrent_handlers()
1891 .min(entries.len())
1892 .max(1);
1893 let semaphore = Arc::new(Semaphore::new(max_concurrent));
1894 let mut tasks = JoinSet::new();
1895 for entry in entries {
1896 let Ok(permit) = Arc::clone(&semaphore).acquire_owned().await else {
1897 tracing::warn!("dispatch signal concurrency limiter closed");
1898 break;
1899 };
1900 let mailbox = Arc::clone(self);
1901 tasks.spawn(async move {
1902 let _permit = permit;
1903 mailbox.handle_dispatch_signal_entry(entry).await;
1904 });
1905 }
1906 while let Some(result) = tasks.join_next().await {
1907 if let Err(error) = result {
1908 tracing::warn!(error = %error, "dispatch signal handler task failed");
1909 }
1910 }
1911 }
1912
1913 async fn handle_dispatch_signal_entry(self: Arc<Self>, entry: DispatchSignalEntry) {
1914 let redelivery_attempts = entry.receipt.redelivery_attempts();
1915 if redelivery_attempts.is_some_and(|attempts| attempts > 1) {
1916 crate::metrics::inc_mailbox_dispatch_signal_redelivery();
1917 }
1918 self.get_or_create_worker(&entry.thread_id).await;
1919 let attempt = self.try_dispatch_next(&entry.thread_id).await;
1920 let nack_delay = match attempt {
1921 DispatchAttempt::TransientError => Some(None),
1922 DispatchAttempt::NoEligible => {
1923 match self.dispatch_signal_still_available(&entry).await {
1924 Ok(true) => Some(Some(dispatch_signal_blocked_nack_delay(
1925 redelivery_attempts,
1926 ))),
1927 Ok(false) => None,
1928 Err(error) => {
1929 tracing::warn!(
1930 thread_id = %entry.thread_id,
1931 dispatch_id = %entry.dispatch_id,
1932 error = %error,
1933 "failed to verify unclaimed dispatch signal"
1934 );
1935 Some(None)
1936 }
1937 }
1938 }
1939 DispatchAttempt::Claimed | DispatchAttempt::Busy => None,
1940 };
1941 if let Some(delay) = nack_delay {
1942 let nack_start = Instant::now();
1943 let result = if let Some(delay) = delay {
1944 entry.receipt.nack_with_delay(delay).await
1945 } else {
1946 entry.receipt.nack().await
1947 };
1948 record_mailbox_operation_result("signal_nack", result_label(&result), nack_start);
1949 if result.is_ok() {
1950 crate::metrics::inc_mailbox_dispatch_signal_nack(delay.is_some());
1951 }
1952 if let Err(error) = result {
1953 tracing::warn!(
1954 thread_id = %entry.thread_id,
1955 dispatch_id = %entry.dispatch_id,
1956 error = %error,
1957 "failed to nack dispatch signal after claim error"
1958 );
1959 }
1960 return;
1961 }
1962 let ack_start = Instant::now();
1963 let ack_result = entry.receipt.ack().await;
1964 record_mailbox_operation_result("signal_ack", result_label(&ack_result), ack_start);
1965 if ack_result.is_ok() {
1966 crate::metrics::inc_mailbox_dispatch_signal_ack();
1967 }
1968 if let Err(error) = ack_result {
1969 tracing::warn!(
1970 thread_id = %entry.thread_id,
1971 dispatch_id = %entry.dispatch_id,
1972 error = %error,
1973 "failed to ack dispatch signal"
1974 );
1975 }
1976 }
1977
1978 async fn dispatch_signal_still_available(
1979 &self,
1980 entry: &awaken_contract::contract::mailbox::DispatchSignalEntry,
1981 ) -> Result<bool, StorageError> {
1982 let now = now_ms();
1983 let Some(dispatch) = self.store.load_dispatch(&entry.dispatch_id).await? else {
1984 return Ok(false);
1985 };
1986 Ok(dispatch.status == RunDispatchStatus::Queued && dispatch.available_at <= now)
1987 }
1988
/// Claims the next dispatch for `thread_id` from the store and starts executing
/// it.
///
/// The only visible caller (`try_dispatch_next`) sets the worker to `Claiming`
/// before calling this. On a claim error or an empty claim the worker is handed
/// to `revert_claiming_to_idle` and `TransientError` / `NoEligible` is
/// returned. On success the worker is switched to `Running`, lease renewal and
/// execution are spawned, and `Claimed` is returned.
#[tracing::instrument(skip(self), fields(thread_id = %thread_id))]
async fn dispatch_next_claim(self: &Arc<Self>, thread_id: &str) -> DispatchAttempt {
    let now = now_ms();
    let claim_start = Instant::now();
    // Ask the store for at most one dispatch, leased to this consumer.
    let claim_result = self
        .store
        .claim(thread_id, &self.consumer_id, self.config.lease_ms, now, 1)
        .await;
    // Distinguish an empty claim from a non-empty one in the operation metric.
    let claim_result_label = match &claim_result {
        Ok(claimed) if claimed.is_empty() => "empty",
        Ok(_) => "ok",
        Err(_) => "error",
    };
    record_mailbox_operation_result("claim", claim_result_label, claim_start);
    let claimed = match claim_result {
        Ok(c) => {
            self.refresh_dispatch_depth_metrics().await;
            c
        }
        Err(e) => {
            tracing::warn!(error = %e, thread_id, "failed to claim dispatch");
            revert_claiming_to_idle(&self.workers, thread_id).await;
            return DispatchAttempt::TransientError;
        }
    };

    let Some(dispatch) = claimed.into_iter().next() else {
        // Nothing claimable right now; release the `Claiming` marker.
        revert_claiming_to_idle(&self.workers, thread_id).await;
        return DispatchAttempt::NoEligible;
    };

    let dispatch_id = dispatch.dispatch_id.clone();
    // A missing claim token falls back to the empty string.
    let claim_token = dispatch.claim_token.clone().unwrap_or_default();

    // Shared flag: read by the lease-renewal task and handed to the execution
    // task, so both sides see the same suspension state.
    let suspended = Arc::new(AtomicBool::new(false));

    // Lease renewal is started before the worker is marked `Running`; its
    // handle is stored in the worker status below.
    let lease_handle = self.spawn_lease_renewal(
        dispatch_id.clone(),
        claim_token.clone(),
        thread_id.to_string(),
        Arc::clone(&suspended),
    );

    // Loading the thread context is best effort: on failure the worker simply
    // starts without one.
    let thread_ctx = match ThreadContext::load(self.run_store.as_ref(), thread_id).await {
        Ok(ctx) => Some(ctx),
        Err(e) => {
            tracing::warn!(thread_id, error = %e, "failed to pre-warm thread context");
            None
        }
    };

    // The receiver half is not consumed in this function.
    // NOTE(review): presumably a real consumer attaches later through
    // `reconnect_sink` — confirm.
    let (event_tx, _event_rx) = mpsc::channel(Self::EVENT_CHANNEL_CAPACITY);
    let reconnectable_sink = Arc::new(ReconnectableEventSink::new(event_tx.clone()));

    let worker = self.get_or_create_worker(thread_id).await;
    {
        // Scoped so the worker lock is released before execution is spawned.
        let mut w = worker.lock();
        w.thread_ctx = thread_ctx;
        w.status = MailboxWorkerStatus::Running {
            dispatch_id: dispatch_id.clone(),
            run_id: dispatch.run_id.clone(),
            lease_handle,
            sink: Arc::clone(&reconnectable_sink),
        };
    }

    self.spawn_execution(
        dispatch,
        event_tx,
        reconnectable_sink,
        claim_token,
        thread_id.to_string(),
        suspended,
    );
    DispatchAttempt::Claimed
}
2074
2075 #[tracing::instrument(skip(self), fields(thread_id = %thread_id))]
2077 async fn try_dispatch_next(self: &Arc<Self>, thread_id: &str) -> DispatchAttempt {
2078 let worker = {
2079 let workers = self.workers.read().await;
2080 match workers.get(thread_id) {
2081 Some(w) => Arc::clone(w),
2082 None => return DispatchAttempt::NoEligible,
2083 }
2084 };
2085
2086 {
2088 let mut w = worker.lock();
2089 if !matches!(w.status, MailboxWorkerStatus::Idle) {
2090 return DispatchAttempt::Busy;
2091 }
2092 w.status = MailboxWorkerStatus::Claiming;
2093 }
2094
2095 self.dispatch_next_claim(thread_id).await
2096 }
2097
/// Spawns a background task that periodically extends the lease of a claimed
/// dispatch and returns its join handle.
///
/// * `dispatch_id` / `claim_token` - identify the lease passed to the store's
///   `extend_lease`.
/// * `thread_id` - thread whose run is cancelled through the executor when the
///   lease is lost.
/// * `suspended` - when set, renewals use `suspended_lease_ms` instead of
///   `lease_ms`.
///
/// The task ends when the store reports the lease as lost (`Ok(false)`) or when
/// an extension call fails.
fn spawn_lease_renewal(
    &self,
    dispatch_id: String,
    claim_token: String,
    thread_id: String,
    suspended: Arc<AtomicBool>,
) -> JoinHandle<()> {
    // Copy everything the task needs so it does not borrow `self`.
    let store = Arc::clone(&self.store);
    let runtime = Arc::clone(&self.executor);
    let lease_ms = self.config.lease_ms;
    let suspended_lease_ms = self.config.suspended_lease_ms;
    let interval = self.config.lease_renewal_interval;

    tokio::spawn(async move {
        let mut tick = tokio::time::interval(interval);
        // Consume one tick up front, so the first renewal happens on the
        // second tick.
        tick.tick().await;
        loop {
            tick.tick().await;
            let now = now_ms();
            // The flag is re-read on every renewal, so the lease length follows
            // the run's current suspension state.
            let effective_lease_ms = if suspended.load(Ordering::Acquire) {
                suspended_lease_ms
            } else {
                lease_ms
            };
            let renew_start = Instant::now();
            match store
                .extend_lease(&dispatch_id, &claim_token, effective_lease_ms, now)
                .await
            {
                Ok(true) => {
                    record_mailbox_operation_result("lease_renewal", "ok", renew_start);
                }
                Ok(false) => {
                    // The store no longer recognises this claim: cancel the
                    // run for this thread and stop renewing.
                    record_mailbox_operation_result("lease_renewal", "lost", renew_start);
                    tracing::warn!(dispatch_id, thread_id, "lease lost, cancelling run");
                    runtime.cancel(&thread_id);
                    break;
                }
                Err(e) => {
                    // NOTE(review): a single failed extension stops renewal for
                    // good without cancelling the run, so the lease will later
                    // expire while the run may still be executing — confirm
                    // this is intended rather than retrying on the next tick.
                    record_mailbox_operation_result("lease_renewal", "error", renew_start);
                    tracing::warn!(dispatch_id, error = %e, "lease extension failed");
                    break;
                }
            }
        }
    })
}
2152
2153 #[tracing::instrument(skip(self, event_tx, reconnectable_sink, suspended), fields(dispatch_id = %dispatch.dispatch_id, thread_id = %thread_id))]
2155 fn spawn_execution(
2156 self: &Arc<Self>,
2157 dispatch: RunDispatch,
2158 event_tx: mpsc::Sender<AgentEvent>,
2159 reconnectable_sink: Arc<ReconnectableEventSink>,
2160 claim_token: String,
2161 thread_id: String,
2162 suspended: Arc<AtomicBool>,
2163 ) {
2164 let this = Arc::clone(self);
2165 let dispatch_id = dispatch.dispatch_id.clone();
2166
2167 tokio::spawn(async move {
2168 crate::metrics::inc_active_runs();
2169 let _guard = ActiveRunGuard;
2170
2171 let sink = SuspensionAwareSink {
2172 inner: reconnectable_sink as Arc<dyn EventSink>,
2173 suspended,
2174 };
2175
2176 let load_start = Instant::now();
2179 let current_dispatch_result = this.store.load_dispatch(&dispatch_id).await;
2180 record_mailbox_operation_result(
2181 "load_dispatch",
2182 result_label(¤t_dispatch_result),
2183 load_start,
2184 );
2185 let current_dispatch = match current_dispatch_result {
2186 Ok(Some(current_dispatch)) => current_dispatch,
2187 Ok(None) => {
2188 tracing::info!(dispatch_id, "dispatch disappeared before execution");
2189 this.finish_execution(&thread_id, &dispatch_id).await;
2190 return;
2191 }
2192 Err(error) => {
2193 tracing::warn!(dispatch_id, error = %error, "failed to verify dispatch before execution");
2194 this.finish_execution(&thread_id, &dispatch_id).await;
2195 return;
2196 }
2197 };
2198 if current_dispatch.status != RunDispatchStatus::Claimed
2199 || current_dispatch.claim_token.as_deref() != Some(claim_token.as_str())
2200 {
2201 tracing::info!(dispatch_id, status = ?current_dispatch.status, "dispatch no longer owned by this worker, skipping execution");
2202 if current_dispatch.status == RunDispatchStatus::Superseded {
2203 this.mark_superseded_dispatch_run_cancelled(
2204 ¤t_dispatch,
2205 "dispatch superseded before execution start",
2206 )
2207 .await;
2208 }
2209 this.finish_execution(&thread_id, &dispatch_id).await;
2210 return;
2211 }
2212 let epoch_start = Instant::now();
2213 let current_epoch_result = this.store.current_dispatch_epoch(&thread_id).await;
2214 record_mailbox_operation_result(
2215 "current_dispatch_epoch",
2216 result_label(¤t_epoch_result),
2217 epoch_start,
2218 );
2219 match current_epoch_result {
2220 Ok(current_epoch) if current_dispatch.dispatch_epoch < current_epoch => {
2221 tracing::info!(
2222 dispatch_id,
2223 thread_id,
2224 dispatch_epoch = current_dispatch.dispatch_epoch,
2225 current_epoch,
2226 "dispatch superseded before execution start"
2227 );
2228 let supersede_reason = "claimed dispatch superseded before execution start";
2229 let supersede_start = Instant::now();
2230 let supersede_result = this
2231 .store
2232 .supersede_claimed(&dispatch_id, &claim_token, now_ms(), supersede_reason)
2233 .await;
2234 record_mailbox_operation_result(
2235 "supersede_claimed",
2236 result_label(&supersede_result),
2237 supersede_start,
2238 );
2239 if supersede_result.is_ok() {
2240 this.refresh_dispatch_depth_metrics().await;
2241 this.mark_superseded_dispatch_run_cancelled(
2242 ¤t_dispatch,
2243 supersede_reason,
2244 )
2245 .await;
2246 }
2247 this.finish_execution(&thread_id, &dispatch_id).await;
2248 return;
2249 }
2250 Ok(_) => {}
2251 Err(error) => {
2252 tracing::warn!(dispatch_id, thread_id, error = %error, "failed to read dispatch epoch before execution");
2253 this.finish_execution(&thread_id, &dispatch_id).await;
2254 return;
2255 }
2256 }
2257
2258 let dispatch_instance_id = uuid::Uuid::now_v7().to_string();
2259 let start_now = now_ms();
2260 record_mailbox_dispatch_start_metrics(&dispatch, start_now);
2261 let mut request = match this.reconstruct_run_request(&dispatch).await {
2262 Ok(request) => request,
2263 Err(error) => {
2264 tracing::error!(dispatch_id, error = %error, "failed to reconstruct run request from durable run record");
2265 let now = now_ms();
2266 record_mailbox_dispatch_completion_metrics(
2267 &dispatch,
2268 start_now,
2269 now,
2270 "permanent_error",
2271 );
2272 let msg = error.to_string();
2273 let run_result = RunDispatchResult {
2274 run_id: dispatch.run_id.clone(),
2275 dispatch_instance_id: dispatch_instance_id.clone(),
2276 status: awaken_contract::contract::lifecycle::RunStatus::Done,
2277 termination: Some(
2278 awaken_contract::contract::lifecycle::TerminationReason::Error(
2279 msg.clone(),
2280 ),
2281 ),
2282 response: None,
2283 error: Some(msg.clone()),
2284 };
2285 let record_start = Instant::now();
2286 let record_result = this
2287 .store
2288 .record_dispatch_start(
2289 &dispatch_id,
2290 &claim_token,
2291 &dispatch_instance_id,
2292 start_now,
2293 )
2294 .await;
2295 record_mailbox_operation_result(
2296 "record_dispatch_start",
2297 result_label(&record_result),
2298 record_start,
2299 );
2300 if let Err(error) = record_result {
2301 tracing::warn!(dispatch_id, error = %error, "failed to record dispatch start for reconstruction failure");
2302 if let Ok(Some(latest_dispatch)) =
2303 this.store.load_dispatch(&dispatch_id).await
2304 && latest_dispatch.status == RunDispatchStatus::Superseded
2305 {
2306 this.mark_superseded_dispatch_run_cancelled(
2307 &latest_dispatch,
2308 "dispatch superseded before reconstruction failure was recorded",
2309 )
2310 .await;
2311 }
2312 this.finish_execution(&thread_id, &dispatch_id).await;
2313 return;
2314 }
2315 let record_result_start = Instant::now();
2316 let record_run_result = this
2317 .store
2318 .record_run_result(&dispatch_id, &claim_token, &run_result, now)
2319 .await;
2320 record_mailbox_operation_result(
2321 "record_run_result",
2322 result_label(&record_run_result),
2323 record_result_start,
2324 );
2325 let dead_letter_start = Instant::now();
2326 let dead_letter_result = this
2327 .store
2328 .dead_letter(&dispatch_id, &claim_token, &msg, now)
2329 .await;
2330 record_mailbox_operation_result(
2331 "dead_letter",
2332 result_label(&dead_letter_result),
2333 dead_letter_start,
2334 );
2335 if dead_letter_result.is_ok() {
2336 this.refresh_dispatch_depth_metrics().await;
2337 if let Ok(Some(dead_letter_dispatch)) =
2338 this.store.load_dispatch(&dispatch_id).await
2339 && dead_letter_dispatch.status == RunDispatchStatus::DeadLetter
2340 {
2341 this.mark_dead_letter_dispatch_run_error(&dead_letter_dispatch)
2342 .await;
2343 }
2344 }
2345 this.finish_execution(&thread_id, &dispatch_id).await;
2346 return;
2347 }
2348 };
2349 normalize_mailbox_run_mode(&mut request, false);
2350 let run_id = dispatch.run_id.clone();
2351 request = request
2352 .with_dispatch_id(dispatch_id.clone())
2353 .with_session_id(dispatch_instance_id.clone());
2354 let record_start = Instant::now();
2355 let record_start_result = this
2356 .store
2357 .record_dispatch_start(&dispatch_id, &claim_token, &dispatch_instance_id, start_now)
2358 .await;
2359 record_mailbox_operation_result(
2360 "record_dispatch_start",
2361 result_label(&record_start_result),
2362 record_start,
2363 );
2364 if let Err(e) = record_start_result {
2365 tracing::warn!(dispatch_id, run_id, error = %e, "failed to record mailbox dispatch start; skipping execution");
2366 if let Ok(Some(latest_dispatch)) = this.store.load_dispatch(&dispatch_id).await
2367 && latest_dispatch.status == RunDispatchStatus::Superseded
2368 {
2369 this.mark_superseded_dispatch_run_cancelled(
2370 &latest_dispatch,
2371 "dispatch superseded before runtime start was recorded",
2372 )
2373 .await;
2374 }
2375 this.finish_execution(&thread_id, &dispatch_id).await;
2376 return;
2377 }
2378 let thread_ctx = {
2379 let workers = this.workers.read().await;
2380 workers.get(&thread_id).and_then(|worker| {
2381 let w = worker.lock();
2382 w.thread_ctx.as_ref().map(|ctx| {
2383 ThreadContextSnapshot::new(
2384 ctx.messages.clone(),
2385 ctx.latest_run.clone(),
2386 ctx.run_cache.clone(),
2387 )
2388 })
2389 })
2390 };
2391 let continue_run_id = request.continue_run_id.clone();
2392 let (inbox_sender, inbox_receiver) = awaken_runtime::inbox::inbox_channel_with_fallback(
2393 Arc::new(TaskDoneMailboxNotify::new(
2394 this.clone(),
2395 dispatch.thread_id.clone(),
2396 continue_run_id,
2397 )),
2398 );
2399 request = request.with_inbox(inbox_sender, inbox_receiver);
2400
2401 let result = this
2402 .executor
2403 .run_with_thread_context(request, Arc::new(sink), thread_ctx)
2404 .await;
2405 let now = now_ms();
2406 let run_result = mailbox_run_result(&run_id, &dispatch_instance_id, &result);
2407 let record_result_start = Instant::now();
2408 let record_run_result = this
2409 .store
2410 .record_run_result(&dispatch_id, &claim_token, &run_result, now)
2411 .await;
2412 record_mailbox_operation_result(
2413 "record_run_result",
2414 result_label(&record_run_result),
2415 record_result_start,
2416 );
2417 if let Err(e) = record_run_result {
2418 tracing::warn!(dispatch_id, run_id, error = %e, "failed to record mailbox run result");
2419 }
2420
2421 let outcome = classify_error(&result);
2422 record_mailbox_dispatch_completion_metrics(
2423 &dispatch,
2424 start_now,
2425 now,
2426 outcome.metric_label(),
2427 );
2428
2429 match outcome {
2430 MailboxRunOutcome::Completed => {
2431 let ack_start = Instant::now();
2432 let ack_result = this.store.ack(&dispatch_id, &claim_token, now).await;
2433 record_mailbox_operation_result("ack", result_label(&ack_result), ack_start);
2434 if let Err(e) = ack_result {
2435 tracing::warn!(dispatch_id, error = %e, "ack failed");
2436 } else {
2437 this.refresh_dispatch_depth_metrics().await;
2438 }
2439 }
2440 MailboxRunOutcome::TransientError(msg) => {
2441 tracing::warn!(dispatch_id, error = %msg, "run failed (transient), nacking");
2442 let _ = event_tx
2445 .send(AgentEvent::RunFinish {
2446 thread_id: dispatch.thread_id.clone(),
2447 run_id: run_id.clone(),
2448 identity: Some(mailbox_run_identity(
2449 &dispatch,
2450 &run_id,
2451 &dispatch_instance_id,
2452 )),
2453 result: None,
2454 termination:
2455 awaken_contract::contract::lifecycle::TerminationReason::Error(
2456 msg.clone(),
2457 ),
2458 })
2459 .await;
2460 let backoff_factor = 2u64.pow(dispatch.attempt_count.saturating_sub(1).min(6));
2461 let retry_at = now
2462 + (this.config.default_retry_delay_ms * backoff_factor)
2463 .min(this.config.max_retry_delay_ms);
2464 let nack_start = Instant::now();
2465 let nack_result = this
2466 .store
2467 .nack(&dispatch_id, &claim_token, retry_at, &msg, now)
2468 .await;
2469 record_mailbox_operation_result("nack", result_label(&nack_result), nack_start);
2470 if let Err(e) = nack_result {
2471 tracing::warn!(dispatch_id, error = %e, "nack failed");
2472 } else {
2473 this.refresh_dispatch_depth_metrics().await;
2474 }
2475 }
2476 MailboxRunOutcome::PermanentError(msg) => {
2477 tracing::warn!(dispatch_id, error = %msg, "run failed (permanent), dead-lettering");
2478 let _ = event_tx
2482 .send(AgentEvent::RunFinish {
2483 thread_id: dispatch.thread_id.clone(),
2484 run_id: run_id.clone(),
2485 identity: Some(mailbox_run_identity(
2486 &dispatch,
2487 &run_id,
2488 &dispatch_instance_id,
2489 )),
2490 result: None,
2491 termination:
2492 awaken_contract::contract::lifecycle::TerminationReason::Error(
2493 msg.clone(),
2494 ),
2495 })
2496 .await;
2497 let dead_letter_start = Instant::now();
2498 let dead_letter_result = this
2499 .store
2500 .dead_letter(&dispatch_id, &claim_token, &msg, now)
2501 .await;
2502 record_mailbox_operation_result(
2503 "dead_letter",
2504 result_label(&dead_letter_result),
2505 dead_letter_start,
2506 );
2507 if let Err(e) = dead_letter_result {
2508 tracing::warn!(dispatch_id, error = %e, "dead_letter failed");
2509 } else {
2510 this.refresh_dispatch_depth_metrics().await;
2511 if let Ok(Some(dead_letter_dispatch)) =
2512 this.store.load_dispatch(&dispatch_id).await
2513 && dead_letter_dispatch.status == RunDispatchStatus::DeadLetter
2514 {
2515 this.mark_dead_letter_dispatch_run_error(&dead_letter_dispatch)
2516 .await;
2517 }
2518 }
2519 }
2520 }
2521
2522 this.finish_execution(&thread_id, &dispatch_id).await;
2523 });
2524 }
2525
2526 async fn finish_execution(self: &Arc<Self>, thread_id: &str, dispatch_id: &str) {
2527 let worker = self.get_or_create_worker(thread_id).await;
2529 {
2530 let mut w = worker.lock();
2531 let should_transition = matches!(
2532 &w.status,
2533 MailboxWorkerStatus::Running { dispatch_id: cid, .. } if cid == dispatch_id
2534 );
2535 if should_transition {
2536 let old = std::mem::replace(&mut w.status, MailboxWorkerStatus::Idle);
2538 w.thread_ctx = None;
2539 if let MailboxWorkerStatus::Running { lease_handle, .. } = old {
2540 lease_handle.abort();
2541 }
2542 }
2543 }
2544
2545 self.try_dispatch_next(thread_id).await;
2547 }
2548
2549 async fn get_or_create_worker(&self, thread_id: &str) -> Arc<SyncMutex<MailboxWorker>> {
2551 {
2553 let workers = self.workers.read().await;
2554 if let Some(w) = workers.get(thread_id) {
2555 return Arc::clone(w);
2556 }
2557 }
2558 let mut workers = self.workers.write().await;
2560 Arc::clone(
2561 workers
2562 .entry(thread_id.to_string())
2563 .or_insert_with(|| Arc::new(SyncMutex::new(MailboxWorker::default()))),
2564 )
2565 }
2566
2567 async fn prepare_run_for_dispatch(
2569 &self,
2570 request: &mut RunRequest,
2571 thread_id: &str,
2572 messages: &[Message],
2573 ) -> Result<String, MailboxError> {
2574 if request.continue_run_id.is_none()
2575 && request.run_id_hint.is_none()
2576 && let Some(waiting_run_id) = self.reusable_waiting_run_id(thread_id).await
2577 {
2578 request.continue_run_id = Some(waiting_run_id);
2579 }
2580
2581 let run_id = request
2582 .continue_run_id
2583 .clone()
2584 .or_else(|| request.run_id_hint.clone())
2585 .filter(|id| !id.trim().is_empty())
2586 .unwrap_or_else(|| uuid::Uuid::now_v7().to_string());
2587 if request.continue_run_id.is_none() {
2588 request.run_id_hint = Some(run_id.clone());
2589 }
2590
2591 let normalized_messages = normalize_message_ids(messages);
2592 let existing_messages = self
2593 .run_store
2594 .load_messages(thread_id)
2595 .await?
2596 .unwrap_or_default();
2597 let previous_run = self.run_store.latest_run(thread_id).await?;
2598 let mut appended_messages = existing_messages;
2599 appended_messages.extend(normalized_messages.iter().cloned());
2600 let input_message_ids = normalized_messages
2601 .iter()
2602 .filter_map(|message| message.id.clone())
2603 .collect::<Vec<_>>();
2604 let request_extras = RunRequestExtras::from_request(request)
2605 .to_value()
2606 .map_err(|e| {
2607 MailboxError::Internal(format!("failed to serialize request extras: {e}"))
2608 })?;
2609 let request_snapshot = RunRequestSnapshot {
2610 origin: request.origin,
2611 sender_id: None,
2612 input_message_ids: input_message_ids.clone(),
2613 input_message_count: normalized_messages.len() as u64,
2614 request_extras,
2615 decisions: request
2616 .decisions
2617 .iter()
2618 .map(|(call_id, resume)| RunResumeDecision {
2619 call_id: call_id.clone(),
2620 resume: resume.clone(),
2621 })
2622 .collect(),
2623 frontend_tools: request.frontend_tools.clone(),
2624 parent_thread_id: request.parent_thread_id.clone(),
2625 transport_request_id: request.transport_request_id.clone(),
2626 };
2627 let input = Some(RunMessageInput {
2628 thread_id: thread_id.to_string(),
2629 range: MessageSeqRange::new(1, appended_messages.len() as u64),
2630 trigger_message_ids: input_message_ids,
2631 selected_message_ids: Vec::new(),
2632 context_policy: None,
2633 compacted_snapshot_id: None,
2634 });
2635
2636 let existing_run = self.run_store.load_run(&run_id).await?;
2637 if let Some(mut existing) = existing_run {
2638 if existing.thread_id != thread_id {
2639 return Err(MailboxError::Validation(format!(
2640 "run_id '{run_id}' belongs to thread '{}', not '{thread_id}'",
2641 existing.thread_id
2642 )));
2643 }
2644 if existing.status != RunStatus::Created && !existing.is_resumable_waiting() {
2645 return Err(MailboxError::Validation(format!(
2646 "run_id '{run_id}' is not open for dispatch"
2647 )));
2648 }
2649 existing.request = Some(request_snapshot);
2650 existing.input = input;
2651 existing.updated_at = now_ms() / 1000;
2652 self.run_store
2653 .checkpoint(thread_id, &appended_messages, &existing)
2654 .await?;
2655 {
2656 let workers = self.workers.read().await;
2657 if let Some(worker) = workers.get(thread_id) {
2658 let mut w = worker.lock();
2659 if let Some(ref mut ctx) = w.thread_ctx {
2660 ctx.apply_checkpoint(&appended_messages, &existing);
2661 }
2662 }
2663 }
2664 return Ok(run_id);
2665 }
2666
2667 let inferred_agent_id = request
2668 .agent_id
2669 .clone()
2670 .or_else(|| {
2671 previous_run.as_ref().and_then(|run| {
2672 (run.status != RunStatus::Created && !run.agent_id.trim().is_empty())
2673 .then(|| run.agent_id.clone())
2674 })
2675 })
2676 .unwrap_or_else(|| "default".to_string());
2677 let inherited_state = previous_run
2678 .as_ref()
2679 .filter(|run| run.status != RunStatus::Created)
2680 .and_then(|run| run.state.clone());
2681 let now = now_ms() / 1000;
2682 let record = RunRecord {
2683 run_id: run_id.clone(),
2684 thread_id: thread_id.to_string(),
2685 agent_id: inferred_agent_id,
2686 parent_run_id: request.parent_run_id.clone(),
2687 request: Some(request_snapshot),
2688 input,
2689 output: None,
2690 status: RunStatus::Created,
2691 termination_reason: None,
2692 final_output: None,
2693 error_payload: None,
2694 dispatch_id: None,
2695 session_id: None,
2696 transport_request_id: request.transport_request_id.clone(),
2697 waiting: None,
2698 outcome: None,
2699 created_at: now,
2700 started_at: None,
2701 finished_at: None,
2702 updated_at: now,
2703 steps: 0,
2704 input_tokens: 0,
2705 output_tokens: 0,
2706 state: inherited_state,
2707 };
2708 self.run_store
2709 .checkpoint(thread_id, &appended_messages, &record)
2710 .await?;
2711 {
2712 let workers = self.workers.read().await;
2713 if let Some(worker) = workers.get(thread_id) {
2714 let mut w = worker.lock();
2715 if let Some(ref mut ctx) = w.thread_ctx {
2716 ctx.apply_checkpoint(&appended_messages, &record);
2717 }
2718 }
2719 }
2720 Ok(run_id)
2721 }
2722
2723 fn build_dispatch(
2725 &self,
2726 request: &RunRequest,
2727 thread_id: &str,
2728 ) -> Result<RunDispatch, MailboxError> {
2729 let run_id = request
2730 .continue_run_id
2731 .clone()
2732 .or_else(|| request.run_id_hint.clone())
2733 .ok_or_else(|| MailboxError::Internal("run_id missing after preparation".into()))?;
2734 let now = now_ms();
2735 Ok(RunDispatch {
2736 dispatch_id: request
2737 .dispatch_id_hint
2738 .clone()
2739 .unwrap_or_else(|| uuid::Uuid::now_v7().to_string()),
2740 thread_id: thread_id.to_string(),
2741 run_id,
2742 priority: 128,
2743 dedupe_key: None,
2744 dispatch_epoch: 0,
2745 status: RunDispatchStatus::Queued,
2746 available_at: now,
2747 attempt_count: 0,
2748 max_attempts: self.config.default_max_attempts,
2749 last_error: None,
2750 claim_token: None,
2751 claimed_by: None,
2752 lease_until: None,
2753 dispatch_instance_id: None,
2754 run_status: None,
2755 termination: None,
2756 run_response: None,
2757 run_error: None,
2758 completed_at: None,
2759 created_at: now,
2760 updated_at: now,
2761 })
2762 }
2763
2764 async fn reconstruct_run_request(
2765 &self,
2766 dispatch: &RunDispatch,
2767 ) -> Result<RunRequest, MailboxError> {
2768 let run = {
2769 let cached = {
2770 let workers = self.workers.read().await;
2771 workers.get(&dispatch.thread_id).and_then(|w| {
2772 let w = w.lock();
2773 w.thread_ctx
2774 .as_ref()
2775 .and_then(|ctx| ctx.get_run(&dispatch.run_id).cloned())
2776 })
2777 };
2778 if let Some(run) = cached {
2779 run
2780 } else {
2781 self.run_store
2782 .load_run(&dispatch.run_id)
2783 .await?
2784 .ok_or_else(|| {
2785 MailboxError::Validation(format!(
2786 "run '{}' not found for dispatch '{}'",
2787 dispatch.run_id, dispatch.dispatch_id
2788 ))
2789 })?
2790 }
2791 };
2792 if run.thread_id != dispatch.thread_id {
2793 return Err(MailboxError::Validation(format!(
2794 "run '{}' belongs to thread '{}', not dispatch thread '{}'",
2795 run.run_id, run.thread_id, dispatch.thread_id
2796 )));
2797 }
2798 let snapshot = run.request.clone().ok_or_else(|| {
2799 MailboxError::Validation(format!("run '{}' has no request snapshot", run.run_id))
2800 })?;
2801 let activation_messages = self.activation_messages_for_run(&run, &snapshot).await?;
2802 let mut request = RunRequest::new(run.thread_id.clone(), activation_messages)
2803 .with_messages_already_persisted(true)
2804 .with_origin(snapshot.origin)
2805 .with_run_mode(RunMode::Resume)
2806 .with_adapter(AdapterKind::Internal);
2807 if !run.agent_id.trim().is_empty() {
2808 request = request.with_agent_id(run.agent_id.clone());
2809 }
2810 if let Some(parent_run_id) = run.parent_run_id.clone() {
2811 request = request.with_parent_run_id(parent_run_id);
2812 }
2813 if let Some(parent_thread_id) = snapshot.parent_thread_id.clone() {
2814 request = request.with_parent_thread_id(parent_thread_id);
2815 }
2816 if let Some(transport_request_id) = snapshot.transport_request_id.clone() {
2817 request = request.with_transport_request_id(transport_request_id);
2818 }
2819 if !snapshot.decisions.is_empty() {
2820 request = request.with_decisions(
2821 snapshot
2822 .decisions
2823 .iter()
2824 .map(|decision| (decision.call_id.clone(), decision.resume.clone()))
2825 .collect(),
2826 );
2827 }
2828 if !snapshot.frontend_tools.is_empty() {
2829 request = request.with_frontend_tools(snapshot.frontend_tools.clone());
2830 }
2831 if let Some(extras_value) = snapshot.request_extras.as_ref() {
2832 let extras = RunRequestExtras::from_value(extras_value).map_err(|error| {
2833 MailboxError::Validation(format!("corrupt request_extras: {error}"))
2834 })?;
2835 request = extras.apply_to(request);
2836 }
2837 request = if run.is_resumable_waiting() {
2838 request.with_continue_run_id(run.run_id.clone())
2839 } else {
2840 request.with_run_id_hint(run.run_id.clone())
2841 };
2842 Ok(request.with_trace_dispatch_id(dispatch.dispatch_id.clone()))
2843 }
2844
2845 async fn activation_messages_for_run(
2846 &self,
2847 run: &RunRecord,
2848 snapshot: &RunRequestSnapshot,
2849 ) -> Result<Vec<Message>, MailboxError> {
2850 if snapshot.input_message_ids.is_empty() {
2851 return self.activation_messages_from_range(run, snapshot).await;
2852 }
2853 let cached_messages: Option<Vec<Message>> = {
2855 let workers = self.workers.read().await;
2856 workers.get(&run.thread_id).and_then(|w| {
2857 let w = w.lock();
2858 w.thread_ctx.as_ref().and_then(|ctx| {
2859 let mut msgs = Vec::with_capacity(snapshot.input_message_ids.len());
2860 for msg_id in &snapshot.input_message_ids {
2861 let found = ctx
2862 .messages
2863 .iter()
2864 .find(|m| m.id.as_deref() == Some(msg_id.as_str()));
2865 msgs.push(found?.clone());
2866 }
2867 Some(msgs)
2868 })
2869 })
2870 };
2871 if let Some(msgs) = cached_messages {
2872 return Ok(msgs);
2873 }
2874 let mut messages = Vec::with_capacity(snapshot.input_message_ids.len());
2875 for message_id in &snapshot.input_message_ids {
2876 let record = self
2877 .run_store
2878 .load_message_record(&run.thread_id, message_id)
2879 .await?
2880 .ok_or_else(|| {
2881 MailboxError::Validation(format!(
2882 "message '{message_id}' not found for run '{}'",
2883 run.run_id
2884 ))
2885 })?;
2886 messages.push(record.message);
2887 }
2888 Ok(messages)
2889 }
2890
2891 async fn activation_messages_from_range(
2892 &self,
2893 run: &RunRecord,
2894 snapshot: &RunRequestSnapshot,
2895 ) -> Result<Vec<Message>, MailboxError> {
2896 let Some(input) = run.input.as_ref() else {
2897 return Ok(Vec::new());
2898 };
2899 let Some(range) = input.range else {
2900 return Ok(Vec::new());
2901 };
2902 let count = snapshot.input_message_count;
2903 if count == 0 {
2904 return Ok(Vec::new());
2905 }
2906 let from_seq = range.to_seq.saturating_sub(count).saturating_add(1);
2907 let Some(range) = MessageSeqRange::new(from_seq.max(range.from_seq), range.to_seq) else {
2908 return Ok(Vec::new());
2909 };
2910 let records = self
2911 .run_store
2912 .load_message_records_range(&run.thread_id, range)
2913 .await?;
2914 Ok(records.into_iter().map(|record| record.message).collect())
2915 }
2916
2917 async fn run_sweep(self: &Arc<Self>) {
2920 let now = now_ms();
2921 let reclaim_start = Instant::now();
2922 let reclaim_result = self.store.reclaim_expired_leases(now, 100).await;
2923 record_mailbox_operation_result("reclaim", result_label(&reclaim_result), reclaim_start);
2924 match reclaim_result {
2925 Ok(reclaimed) => {
2926 crate::metrics::inc_mailbox_operation_by(
2927 "reclaim_dispatch",
2928 "ok",
2929 reclaimed.len() as u64,
2930 );
2931 if !reclaimed.is_empty() {
2932 tracing::info!(count = reclaimed.len(), "sweep reclaimed expired leases");
2933 self.refresh_dispatch_depth_metrics().await;
2934 for dispatch in reclaimed {
2935 self.reconcile_terminal_dispatch(&dispatch).await;
2936 if dispatch.status == RunDispatchStatus::Queued {
2937 let thread_id = dispatch.thread_id.clone();
2938 self.get_or_create_worker(&thread_id).await;
2939 self.try_dispatch_next(&thread_id).await;
2940 }
2941 }
2942 }
2943 self.reconcile_terminal_dispatches().await;
2944 }
2945 Err(e) => {
2946 tracing::warn!(error = %e, "sweep failed");
2947 }
2948 }
2949 }
2950
2951 async fn run_gc(&self) {
2952 let now = now_ms();
2953 let gc_ttl_ms = self.config.gc_ttl.as_millis() as u64;
2954 let older_than = now.saturating_sub(gc_ttl_ms);
2955 let purge_start = Instant::now();
2956 let purge_result = self.store.purge_terminal(older_than).await;
2957 record_mailbox_operation_result("purge_terminal", result_label(&purge_result), purge_start);
2958 match purge_result {
2959 Ok(purged) => {
2960 crate::metrics::inc_mailbox_operation_by("purged", "ok", purged as u64);
2961 if purged > 0 {
2962 tracing::info!(purged, "GC purged terminal dispatches");
2963 self.refresh_dispatch_depth_metrics().await;
2964 }
2965 }
2966 Err(e) => {
2967 tracing::warn!(error = %e, "GC failed");
2968 }
2969 }
2970
2971 self.gc_idle_workers().await;
2973 }
2974
2975 async fn gc_idle_workers(&self) {
2980 let idle_keys: Vec<String> = {
2981 let workers = self.workers.read().await;
2982 let mut keys = Vec::new();
2983 for (thread_id, worker) in workers.iter() {
2984 let w = worker.lock();
2985 if matches!(w.status, MailboxWorkerStatus::Idle) {
2986 keys.push(thread_id.clone());
2987 }
2988 }
2989 keys
2990 };
2991
2992 if idle_keys.is_empty() {
2993 return;
2994 }
2995
2996 let mut removable = Vec::new();
3000 for thread_id in &idle_keys {
3001 let has_queued = self
3002 .store
3003 .list_dispatches(
3004 thread_id,
3005 Some(&[RunDispatchStatus::Queued, RunDispatchStatus::Claimed]),
3006 1,
3007 0,
3008 )
3009 .await
3010 .map(|dispatches| !dispatches.is_empty())
3011 .unwrap_or(true); if !has_queued {
3014 removable.push(thread_id.clone());
3015 }
3016 }
3017
3018 if removable.is_empty() {
3019 return;
3020 }
3021
3022 let mut removed = 0usize;
3023 let mut workers = self.workers.write().await;
3024 for thread_id in removable {
3025 let still_idle = if let Some(worker) = workers.get(&thread_id) {
3028 let w = worker.lock();
3029 matches!(w.status, MailboxWorkerStatus::Idle)
3030 } else {
3031 false
3032 };
3033 if still_idle {
3034 workers.remove(&thread_id);
3035 removed += 1;
3036 }
3037 }
3038
3039 if removed > 0 {
3040 tracing::debug!(removed, "GC removed idle workers");
3041 }
3042 }
3043}
3044
3045async fn revert_claiming_to_idle(
3048 workers: &RwLock<HashMap<String, Arc<SyncMutex<MailboxWorker>>>>,
3049 thread_id: &str,
3050) {
3051 let workers = workers.read().await;
3052 if let Some(worker) = workers.get(thread_id) {
3053 let mut w = worker.lock();
3054 if matches!(w.status, MailboxWorkerStatus::Claiming) {
3055 w.status = MailboxWorkerStatus::Idle;
3056 }
3057 }
3058}
3059
3060fn normalize_mailbox_run_mode(request: &mut RunRequest, background: bool) {
3063 if request.run_mode != RunMode::Foreground {
3064 return;
3065 }
3066
3067 request.run_mode = if !request.decisions.is_empty() || request.continue_run_id.is_some() {
3068 RunMode::Resume
3069 } else if matches!(
3070 request.origin,
3071 awaken_contract::contract::storage::RunRequestOrigin::Internal
3072 ) {
3073 RunMode::InternalWake
3074 } else if background {
3075 RunMode::Scheduled
3076 } else {
3077 RunMode::Foreground
3078 };
3079}
3080
3081fn validate_run_inputs(
3087 thread_id: String,
3088 messages: Vec<Message>,
3089 allow_empty_messages: bool,
3090) -> Result<(String, Vec<Message>), MailboxError> {
3091 if messages.is_empty() && !allow_empty_messages {
3092 return Err(MailboxError::Validation(
3093 "at least one message is required".to_string(),
3094 ));
3095 }
3096 let thread_id = {
3097 let trimmed = thread_id.trim().to_string();
3098 if trimmed.is_empty() {
3099 uuid::Uuid::now_v7().to_string()
3100 } else {
3101 trimmed
3102 }
3103 };
3104 Ok((thread_id, messages))
3105}
3106
3107fn normalize_message_ids(messages: &[Message]) -> Vec<Message> {
3108 messages
3109 .iter()
3110 .cloned()
3111 .map(|mut message| {
3112 if message.id.as_deref().map(str::is_empty).unwrap_or(true) {
3113 message.id = Some(awaken_contract::contract::message::gen_message_id());
3114 }
3115 message
3116 })
3117 .collect()
3118}
3119
3120fn live_target_for_dispatch(dispatch: &RunDispatch) -> LiveRunTarget {
3121 LiveRunTarget::new(dispatch.thread_id.clone(), dispatch.run_id.clone())
3122 .with_dispatch_id(dispatch.dispatch_id.clone())
3123}
3124
3125fn live_target_for_run(run: &RunRecord) -> LiveRunTarget {
3126 let mut target = LiveRunTarget::new(run.thread_id.clone(), run.run_id.clone());
3127 if let Some(dispatch_id) = run.dispatch_id.clone() {
3128 target = target.with_dispatch_id(dispatch_id);
3129 }
3130 target
3131}
3132
3133fn mailbox_run_result(
3134 run_id: &str,
3135 dispatch_instance_id: &str,
3136 result: &Result<
3137 awaken_runtime::loop_runner::AgentRunResult,
3138 awaken_runtime::loop_runner::AgentLoopError,
3139 >,
3140) -> RunDispatchResult {
3141 use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
3142
3143 match result {
3144 Ok(run) => {
3145 let (status, _) = run.termination.to_run_status();
3146 RunDispatchResult {
3147 run_id: run.run_id.clone(),
3148 dispatch_instance_id: dispatch_instance_id.to_string(),
3149 status,
3150 termination: Some(run.termination.clone()),
3151 response: (!run.response.is_empty()).then(|| run.response.clone()),
3152 error: match &run.termination {
3153 TerminationReason::Error(message) => Some(message.clone()),
3154 _ => None,
3155 },
3156 }
3157 }
3158 Err(error) => RunDispatchResult {
3159 run_id: run_id.to_string(),
3160 dispatch_instance_id: dispatch_instance_id.to_string(),
3161 status: RunStatus::Done,
3162 termination: Some(TerminationReason::Error(error.to_string())),
3163 response: None,
3164 error: Some(error.to_string()),
3165 },
3166 }
3167}
3168
3169fn mailbox_run_identity(
3170 dispatch: &RunDispatch,
3171 run_id: &str,
3172 dispatch_instance_id: &str,
3173) -> awaken_contract::contract::identity::RunIdentity {
3174 awaken_contract::contract::identity::RunIdentity::new(
3175 dispatch.thread_id.clone(),
3176 None,
3177 run_id.to_string(),
3178 None,
3179 String::new(),
3180 awaken_contract::contract::identity::RunOrigin::Internal,
3181 )
3182 .with_dispatch_id(dispatch.dispatch_id.clone())
3183 .with_session_id(dispatch_instance_id.to_string())
3184}
3185
/// Converts a millisecond count into fractional seconds.
fn millis_to_seconds(ms: u64) -> f64 {
    const MILLIS_PER_SECOND: f64 = 1_000.0;
    (ms as f64) / MILLIS_PER_SECOND
}
3189
3190fn record_mailbox_dispatch_start_metrics(dispatch: &RunDispatch, start_now: u64) {
3191 let enqueue_to_start_ms = start_now.saturating_sub(dispatch.created_at);
3192 let eligible_to_start_ms = start_now.saturating_sub(dispatch.available_at);
3193 let claim_to_start_ms = start_now.saturating_sub(dispatch.updated_at);
3194
3195 crate::metrics::record_mailbox_dispatch_enqueue_to_start(millis_to_seconds(
3196 enqueue_to_start_ms,
3197 ));
3198 crate::metrics::record_mailbox_dispatch_eligible_to_start(millis_to_seconds(
3199 eligible_to_start_ms,
3200 ));
3201 crate::metrics::record_mailbox_dispatch_claim_to_start(millis_to_seconds(claim_to_start_ms));
3202
3203 tracing::info!(
3204 dispatch_id = %dispatch.dispatch_id,
3205 run_id = %dispatch.run_id,
3206 thread_id = %dispatch.thread_id,
3207 enqueue_to_start_ms,
3208 eligible_to_start_ms,
3209 claim_to_start_ms,
3210 "mailbox dispatch processing started"
3211 );
3212}
3213
3214fn record_mailbox_dispatch_completion_metrics(
3215 dispatch: &RunDispatch,
3216 start_now: u64,
3217 completed_now: u64,
3218 outcome: &str,
3219) {
3220 let runtime_ms = completed_now.saturating_sub(start_now);
3221 let enqueue_to_complete_ms = completed_now.saturating_sub(dispatch.created_at);
3222
3223 crate::metrics::record_mailbox_dispatch_runtime(millis_to_seconds(runtime_ms), outcome);
3224 crate::metrics::record_mailbox_dispatch_enqueue_to_complete(
3225 millis_to_seconds(enqueue_to_complete_ms),
3226 outcome,
3227 );
3228 crate::metrics::record_run_completion(millis_to_seconds(runtime_ms), outcome);
3229
3230 tracing::info!(
3231 dispatch_id = %dispatch.dispatch_id,
3232 run_id = %dispatch.run_id,
3233 thread_id = %dispatch.thread_id,
3234 outcome,
3235 runtime_ms,
3236 enqueue_to_complete_ms,
3237 "mailbox dispatch processing completed"
3238 );
3239}
3240
3241fn record_mailbox_dispatch_terminal_metrics(dispatch: &RunDispatch, outcome: &str) {
3242 let completed_now = dispatch.completed_at.unwrap_or_else(now_ms);
3243 record_mailbox_dispatch_completion_metrics(dispatch, completed_now, completed_now, outcome);
3244}
3245
3246fn record_mailbox_operation_result(operation: &str, result: &str, start: Instant) {
3247 crate::metrics::record_mailbox_operation(operation, result, start.elapsed().as_secs_f64());
3248}
3249
3250fn dispatch_signal_blocked_nack_delay(redelivery_attempts: Option<u64>) -> Duration {
3251 let exponent = redelivery_attempts.unwrap_or(1).saturating_sub(1).min(16);
3252 let multiplier = 1u32.checked_shl(exponent as u32).unwrap_or(u32::MAX);
3253 dispatch_signal_nack_base_delay()
3254 .saturating_mul(multiplier)
3255 .min(dispatch_signal_nack_max_delay())
3256}
3257
3258fn dispatch_signal_batch_size() -> usize {
3259 env_usize(DISPATCH_SIGNAL_BATCH_ENV, DISPATCH_SIGNAL_BATCH_DEFAULT)
3260}
3261
3262fn dispatch_signal_fetch_expires() -> Duration {
3263 env_duration_ms(DISPATCH_SIGNAL_EXPIRES_ENV, DISPATCH_SIGNAL_EXPIRES_DEFAULT)
3264}
3265
3266fn dispatch_signal_nack_base_delay() -> Duration {
3267 env_duration_ms(
3268 DISPATCH_SIGNAL_NACK_BASE_DELAY_ENV,
3269 DISPATCH_SIGNAL_BLOCKED_NACK_BASE_DELAY_DEFAULT,
3270 )
3271}
3272
3273fn dispatch_signal_nack_max_delay() -> Duration {
3274 env_duration_ms(
3275 DISPATCH_SIGNAL_NACK_MAX_DELAY_ENV,
3276 DISPATCH_SIGNAL_BLOCKED_NACK_MAX_DELAY_DEFAULT,
3277 )
3278}
3279
3280fn dispatch_signal_max_concurrent_handlers() -> usize {
3281 env_usize(
3282 DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_ENV,
3283 DISPATCH_SIGNAL_MAX_CONCURRENT_HANDLERS_DEFAULT,
3284 )
3285}
3286
/// Reads a strictly positive `usize` from the environment variable `name`.
///
/// Surrounding whitespace (for example a trailing newline picked up from an
/// env file) is ignored. Returns `default` when the variable is unset, is not
/// valid Unicode, does not parse as a `usize`, or is zero.
fn env_usize(name: &str, default: usize) -> usize {
    std::env::var(name)
        .ok()
        // `str::parse` rejects padded input, so trim first instead of
        // silently discarding an otherwise valid setting.
        .and_then(|value| value.trim().parse::<usize>().ok())
        .filter(|value| *value > 0)
        .unwrap_or(default)
}
3294
/// Reads a strictly positive millisecond count from the environment variable
/// `name` and returns it as a `Duration`.
///
/// Surrounding whitespace (for example a trailing newline picked up from an
/// env file) is ignored. Returns `default` when the variable is unset, is not
/// valid Unicode, does not parse as a `u64`, or is zero.
fn env_duration_ms(name: &str, default: Duration) -> Duration {
    std::env::var(name)
        .ok()
        // `str::parse` rejects padded input, so trim first instead of
        // silently discarding an otherwise valid setting.
        .and_then(|value| value.trim().parse::<u64>().ok())
        .filter(|value| *value > 0)
        .map(Duration::from_millis)
        .unwrap_or(default)
}
3303
/// Metric label for a result: `"ok"` for `Ok`, `"error"` for `Err`.
fn result_label<T, E>(result: &Result<T, E>) -> &'static str {
    match result {
        Ok(_) => "ok",
        Err(_) => "error",
    }
}
3307
3308fn dispatch_status_label(status: RunDispatchStatus) -> &'static str {
3309 match status {
3310 RunDispatchStatus::Queued => "queued",
3311 RunDispatchStatus::Claimed => "claimed",
3312 RunDispatchStatus::Acked => "acked",
3313 RunDispatchStatus::Cancelled => "cancelled",
3314 RunDispatchStatus::Superseded => "superseded",
3315 RunDispatchStatus::DeadLetter => "dead_letter",
3316 }
3317}
3318
3319fn classify_error(
3321 result: &Result<
3322 awaken_runtime::loop_runner::AgentRunResult,
3323 awaken_runtime::loop_runner::AgentLoopError,
3324 >,
3325) -> MailboxRunOutcome {
3326 match result {
3327 Ok(_) => MailboxRunOutcome::Completed,
3328 Err(e) => {
3329 use awaken_runtime::loop_runner::AgentLoopError;
3330 match e {
3331 AgentLoopError::RuntimeError(re) => {
3332 use awaken_runtime::RuntimeError;
3333 match re {
3334 RuntimeError::ThreadAlreadyRunning { .. } => {
3335 MailboxRunOutcome::PermanentError(e.to_string())
3338 }
3339 RuntimeError::AgentNotFound { .. } | RuntimeError::ResolveFailed { .. } => {
3340 MailboxRunOutcome::PermanentError(e.to_string())
3341 }
3342 _ => MailboxRunOutcome::TransientError(e.to_string()),
3343 }
3344 }
3345 AgentLoopError::StorageError(_) => MailboxRunOutcome::TransientError(e.to_string()),
3346 AgentLoopError::InferenceFailed(_) => {
3347 MailboxRunOutcome::TransientError(e.to_string())
3348 }
3349 _ => MailboxRunOutcome::Completed,
3351 }
3352 }
3353 }
3354}
3355
3356#[cfg(test)]
3359mod tests {
3360 use super::*;
3361 use async_trait::async_trait;
3362 use awaken_contract::contract::content::ContentBlock;
3363 use awaken_contract::contract::executor::{
3364 InferenceExecutionError, InferenceRequest, LlmExecutor,
3365 };
3366 use awaken_contract::contract::inference::{StopReason, StreamResult};
3367 use awaken_contract::contract::lifecycle::{RunStatus, TerminationReason};
3368 use awaken_contract::contract::message::{Message, ToolCall};
3369 use awaken_contract::contract::storage::RunRequestOrigin;
3370 use awaken_contract::contract::storage::{
3371 RunRecord, RunStore, RunWaitingState, ThreadRunStore, ThreadStore, WaitingReason,
3372 };
3373 use awaken_contract::contract::tool::{
3374 Tool, ToolCallContext, ToolDescriptor, ToolError, ToolOutput, ToolResult,
3375 };
3376 use awaken_contract::thread::Thread;
3377 use awaken_runtime::extensions::background::{
3378 BackgroundTaskManager, BackgroundTaskPlugin, TaskParentContext,
3379 TaskResult as BackgroundTaskResult,
3380 };
3381 use awaken_runtime::loop_runner::build_agent_env;
3382 use awaken_runtime::{Plugin, ResolvedAgent};
3383 use awaken_stores::{InMemoryMailboxStore, InMemoryStore};
3384 use serde_json::{Value, json};
3385 use std::collections::VecDeque;
3386 use std::sync::Mutex as StdMutex;
3387 use std::sync::atomic::AtomicUsize;
3388 use tokio::time::{Duration, Instant, sleep};
3389
3390 struct StubResolver;
3394 impl awaken_runtime::AgentResolver for StubResolver {
3395 fn resolve(
3396 &self,
3397 agent_id: &str,
3398 ) -> Result<awaken_runtime::ResolvedAgent, awaken_runtime::RuntimeError> {
3399 Err(awaken_runtime::RuntimeError::AgentNotFound {
3400 agent_id: agent_id.to_string(),
3401 })
3402 }
3403 }
3404
3405 fn make_store() -> Arc<InMemoryMailboxStore> {
3406 Arc::new(InMemoryMailboxStore::new())
3407 }
3408
3409 fn make_resume() -> ToolCallResume {
3410 ToolCallResume {
3411 decision_id: "d1".into(),
3412 action: awaken_contract::contract::suspension::ResumeDecisionAction::Resume,
3413 result: serde_json::json!({"approved": true}),
3414 reason: None,
3415 updated_at: 0,
3416 }
3417 }
3418
3419 struct RecoverFlakyMailboxStore {
3420 inner: InMemoryMailboxStore,
3421 reclaim_failures_remaining: AtomicUsize,
3422 reclaim_calls: AtomicUsize,
3423 }
3424
3425 impl RecoverFlakyMailboxStore {
3426 fn new(reclaim_failures: usize) -> Self {
3427 Self {
3428 inner: InMemoryMailboxStore::new(),
3429 reclaim_failures_remaining: AtomicUsize::new(reclaim_failures),
3430 reclaim_calls: AtomicUsize::new(0),
3431 }
3432 }
3433
3434 fn reclaim_calls(&self) -> usize {
3435 self.reclaim_calls.load(Ordering::SeqCst)
3436 }
3437 }
3438
3439 #[async_trait::async_trait]
3440 impl MailboxStore for RecoverFlakyMailboxStore {
3441 async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3442 self.inner.enqueue(dispatch).await
3443 }
3444
3445 async fn claim(
3446 &self,
3447 thread_id: &str,
3448 consumer_id: &str,
3449 lease_ms: u64,
3450 now: u64,
3451 limit: usize,
3452 ) -> Result<Vec<RunDispatch>, StorageError> {
3453 self.inner
3454 .claim(thread_id, consumer_id, lease_ms, now, limit)
3455 .await
3456 }
3457
3458 async fn claim_dispatch(
3459 &self,
3460 dispatch_id: &str,
3461 consumer_id: &str,
3462 lease_ms: u64,
3463 now: u64,
3464 ) -> Result<Option<RunDispatch>, StorageError> {
3465 self.inner
3466 .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3467 .await
3468 }
3469
3470 async fn ack(
3471 &self,
3472 dispatch_id: &str,
3473 claim_token: &str,
3474 now: u64,
3475 ) -> Result<(), StorageError> {
3476 self.inner.ack(dispatch_id, claim_token, now).await
3477 }
3478
3479 async fn record_dispatch_start(
3480 &self,
3481 dispatch_id: &str,
3482 claim_token: &str,
3483 dispatch_instance_id: &str,
3484 now: u64,
3485 ) -> Result<(), StorageError> {
3486 self.inner
3487 .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
3488 .await
3489 }
3490
3491 async fn record_run_result(
3492 &self,
3493 dispatch_id: &str,
3494 claim_token: &str,
3495 result: &RunDispatchResult,
3496 now: u64,
3497 ) -> Result<(), StorageError> {
3498 self.inner
3499 .record_run_result(dispatch_id, claim_token, result, now)
3500 .await
3501 }
3502
3503 async fn nack(
3504 &self,
3505 dispatch_id: &str,
3506 claim_token: &str,
3507 retry_at: u64,
3508 error: &str,
3509 now: u64,
3510 ) -> Result<(), StorageError> {
3511 self.inner
3512 .nack(dispatch_id, claim_token, retry_at, error, now)
3513 .await
3514 }
3515
3516 async fn dead_letter(
3517 &self,
3518 dispatch_id: &str,
3519 claim_token: &str,
3520 error: &str,
3521 now: u64,
3522 ) -> Result<(), StorageError> {
3523 self.inner
3524 .dead_letter(dispatch_id, claim_token, error, now)
3525 .await
3526 }
3527
3528 async fn cancel(
3529 &self,
3530 dispatch_id: &str,
3531 now: u64,
3532 ) -> Result<Option<RunDispatch>, StorageError> {
3533 self.inner.cancel(dispatch_id, now).await
3534 }
3535
3536 async fn extend_lease(
3537 &self,
3538 dispatch_id: &str,
3539 claim_token: &str,
3540 extension_ms: u64,
3541 now: u64,
3542 ) -> Result<bool, StorageError> {
3543 self.inner
3544 .extend_lease(dispatch_id, claim_token, extension_ms, now)
3545 .await
3546 }
3547
3548 async fn interrupt(
3549 &self,
3550 thread_id: &str,
3551 now: u64,
3552 ) -> Result<MailboxInterrupt, StorageError> {
3553 self.inner.interrupt(thread_id, now).await
3554 }
3555
3556 async fn interrupt_detailed(
3557 &self,
3558 thread_id: &str,
3559 now: u64,
3560 ) -> Result<MailboxInterruptDetails, StorageError> {
3561 self.inner.interrupt_detailed(thread_id, now).await
3562 }
3563
3564 async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
3565 self.inner.current_dispatch_epoch(thread_id).await
3566 }
3567
3568 async fn supersede_claimed(
3569 &self,
3570 dispatch_id: &str,
3571 claim_token: &str,
3572 now: u64,
3573 reason: &str,
3574 ) -> Result<Option<RunDispatch>, StorageError> {
3575 self.inner
3576 .supersede_claimed(dispatch_id, claim_token, now, reason)
3577 .await
3578 }
3579
3580 async fn load_dispatch(
3581 &self,
3582 dispatch_id: &str,
3583 ) -> Result<Option<RunDispatch>, StorageError> {
3584 self.inner.load_dispatch(dispatch_id).await
3585 }
3586
3587 async fn list_dispatches(
3588 &self,
3589 thread_id: &str,
3590 status_filter: Option<&[RunDispatchStatus]>,
3591 limit: usize,
3592 offset: usize,
3593 ) -> Result<Vec<RunDispatch>, StorageError> {
3594 self.inner
3595 .list_dispatches(thread_id, status_filter, limit, offset)
3596 .await
3597 }
3598
3599 async fn list_terminal_dispatches(
3600 &self,
3601 limit: usize,
3602 offset: usize,
3603 ) -> Result<Vec<RunDispatch>, StorageError> {
3604 self.inner.list_terminal_dispatches(limit, offset).await
3605 }
3606
3607 async fn reclaim_expired_leases(
3608 &self,
3609 now: u64,
3610 limit: usize,
3611 ) -> Result<Vec<RunDispatch>, StorageError> {
3612 self.reclaim_calls.fetch_add(1, Ordering::SeqCst);
3613 let remaining = self.reclaim_failures_remaining.load(Ordering::SeqCst);
3614 if remaining > 0
3615 && self
3616 .reclaim_failures_remaining
3617 .compare_exchange(remaining, remaining - 1, Ordering::SeqCst, Ordering::SeqCst)
3618 .is_ok()
3619 {
3620 return Err(StorageError::Io("injected startup recovery failure".into()));
3621 }
3622 self.inner.reclaim_expired_leases(now, limit).await
3623 }
3624
3625 async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
3626 self.inner.purge_terminal(older_than).await
3627 }
3628
3629 async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
3630 self.inner.queued_thread_ids().await
3631 }
3632 }
3633
3634 #[derive(Clone)]
3635 struct TestDispatchSignal {
3636 thread_id: String,
3637 dispatch_id: String,
3638 }
3639
3640 struct TestDispatchSignalReceipt {
3641 signal: TestDispatchSignal,
3642 queue: Arc<tokio::sync::Mutex<VecDeque<TestDispatchSignal>>>,
3643 acked_count: Arc<AtomicUsize>,
3644 nacked_count: Arc<AtomicUsize>,
3645 }
3646
3647 #[async_trait::async_trait]
3648 impl awaken_contract::contract::mailbox::DispatchSignalReceipt for TestDispatchSignalReceipt {
3649 async fn ack(self: Box<Self>) -> Result<(), StorageError> {
3650 self.acked_count.fetch_add(1, Ordering::SeqCst);
3651 Ok(())
3652 }
3653
3654 async fn nack(self: Box<Self>) -> Result<(), StorageError> {
3655 self.nacked_count.fetch_add(1, Ordering::SeqCst);
3656 self.queue.lock().await.push_back(self.signal.clone());
3657 Ok(())
3658 }
3659 }
3660
3661 struct SignalMailboxStore {
3662 inner: InMemoryMailboxStore,
3663 signals: Arc<tokio::sync::Mutex<VecDeque<TestDispatchSignal>>>,
3664 acked_count: Arc<AtomicUsize>,
3665 nacked_count: Arc<AtomicUsize>,
3666 claim_failures_remaining: AtomicUsize,
3667 claim_dispatch_empty_once: AtomicBool,
3668 }
3669
3670 impl SignalMailboxStore {
3671 fn new() -> Self {
3672 Self::with_claim_failures(0)
3673 }
3674
3675 fn with_claim_failures(claim_failures: usize) -> Self {
3676 Self::with_failures_and_empty_claim_dispatch(claim_failures, false)
3677 }
3678
3679 fn with_empty_claim_dispatch_once() -> Self {
3680 Self::with_failures_and_empty_claim_dispatch(0, true)
3681 }
3682
3683 fn with_failures_and_empty_claim_dispatch(
3684 claim_failures: usize,
3685 claim_dispatch_empty_once: bool,
3686 ) -> Self {
3687 Self {
3688 inner: InMemoryMailboxStore::new(),
3689 signals: Arc::new(tokio::sync::Mutex::new(VecDeque::new())),
3690 acked_count: Arc::new(AtomicUsize::new(0)),
3691 nacked_count: Arc::new(AtomicUsize::new(0)),
3692 claim_failures_remaining: AtomicUsize::new(claim_failures),
3693 claim_dispatch_empty_once: AtomicBool::new(claim_dispatch_empty_once),
3694 }
3695 }
3696
3697 fn acked_signal_count(&self) -> usize {
3698 self.acked_count.load(Ordering::SeqCst)
3699 }
3700
3701 fn nacked_signal_count(&self) -> usize {
3702 self.nacked_count.load(Ordering::SeqCst)
3703 }
3704 }
3705
3706 #[async_trait::async_trait]
3707 impl MailboxStore for SignalMailboxStore {
3708 async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3709 self.inner.enqueue(dispatch).await?;
3710 self.signals.lock().await.push_back(TestDispatchSignal {
3711 thread_id: dispatch.thread_id.clone(),
3712 dispatch_id: dispatch.dispatch_id.clone(),
3713 });
3714 Ok(())
3715 }
3716
3717 async fn claim(
3718 &self,
3719 thread_id: &str,
3720 consumer_id: &str,
3721 lease_ms: u64,
3722 now: u64,
3723 limit: usize,
3724 ) -> Result<Vec<RunDispatch>, StorageError> {
3725 let remaining = self.claim_failures_remaining.load(Ordering::SeqCst);
3726 if remaining > 0
3727 && self
3728 .claim_failures_remaining
3729 .compare_exchange(remaining, remaining - 1, Ordering::SeqCst, Ordering::SeqCst)
3730 .is_ok()
3731 {
3732 return Err(StorageError::Io("injected claim failure".into()));
3733 }
3734 self.inner
3735 .claim(thread_id, consumer_id, lease_ms, now, limit)
3736 .await
3737 }
3738
3739 async fn claim_dispatch(
3740 &self,
3741 dispatch_id: &str,
3742 consumer_id: &str,
3743 lease_ms: u64,
3744 now: u64,
3745 ) -> Result<Option<RunDispatch>, StorageError> {
3746 if self.claim_dispatch_empty_once.swap(false, Ordering::SeqCst) {
3747 return Ok(None);
3748 }
3749 self.inner
3750 .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3751 .await
3752 }
3753
3754 async fn ack(
3755 &self,
3756 dispatch_id: &str,
3757 claim_token: &str,
3758 now: u64,
3759 ) -> Result<(), StorageError> {
3760 self.inner.ack(dispatch_id, claim_token, now).await
3761 }
3762
3763 async fn record_dispatch_start(
3764 &self,
3765 dispatch_id: &str,
3766 claim_token: &str,
3767 dispatch_instance_id: &str,
3768 now: u64,
3769 ) -> Result<(), StorageError> {
3770 self.inner
3771 .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
3772 .await
3773 }
3774
3775 async fn record_run_result(
3776 &self,
3777 dispatch_id: &str,
3778 claim_token: &str,
3779 result: &RunDispatchResult,
3780 now: u64,
3781 ) -> Result<(), StorageError> {
3782 self.inner
3783 .record_run_result(dispatch_id, claim_token, result, now)
3784 .await
3785 }
3786
3787 async fn nack(
3788 &self,
3789 dispatch_id: &str,
3790 claim_token: &str,
3791 retry_at: u64,
3792 error: &str,
3793 now: u64,
3794 ) -> Result<(), StorageError> {
3795 self.inner
3796 .nack(dispatch_id, claim_token, retry_at, error, now)
3797 .await
3798 }
3799
3800 async fn dead_letter(
3801 &self,
3802 dispatch_id: &str,
3803 claim_token: &str,
3804 error: &str,
3805 now: u64,
3806 ) -> Result<(), StorageError> {
3807 self.inner
3808 .dead_letter(dispatch_id, claim_token, error, now)
3809 .await
3810 }
3811
3812 async fn cancel(
3813 &self,
3814 dispatch_id: &str,
3815 now: u64,
3816 ) -> Result<Option<RunDispatch>, StorageError> {
3817 self.inner.cancel(dispatch_id, now).await
3818 }
3819
3820 async fn extend_lease(
3821 &self,
3822 dispatch_id: &str,
3823 claim_token: &str,
3824 extension_ms: u64,
3825 now: u64,
3826 ) -> Result<bool, StorageError> {
3827 self.inner
3828 .extend_lease(dispatch_id, claim_token, extension_ms, now)
3829 .await
3830 }
3831
3832 async fn interrupt(
3833 &self,
3834 thread_id: &str,
3835 now: u64,
3836 ) -> Result<MailboxInterrupt, StorageError> {
3837 self.inner.interrupt(thread_id, now).await
3838 }
3839
3840 async fn interrupt_detailed(
3841 &self,
3842 thread_id: &str,
3843 now: u64,
3844 ) -> Result<MailboxInterruptDetails, StorageError> {
3845 self.inner.interrupt_detailed(thread_id, now).await
3846 }
3847
3848 async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
3849 self.inner.current_dispatch_epoch(thread_id).await
3850 }
3851
3852 async fn supersede_claimed(
3853 &self,
3854 dispatch_id: &str,
3855 claim_token: &str,
3856 now: u64,
3857 reason: &str,
3858 ) -> Result<Option<RunDispatch>, StorageError> {
3859 self.inner
3860 .supersede_claimed(dispatch_id, claim_token, now, reason)
3861 .await
3862 }
3863
3864 async fn load_dispatch(
3865 &self,
3866 dispatch_id: &str,
3867 ) -> Result<Option<RunDispatch>, StorageError> {
3868 self.inner.load_dispatch(dispatch_id).await
3869 }
3870
3871 async fn list_dispatches(
3872 &self,
3873 thread_id: &str,
3874 status_filter: Option<&[RunDispatchStatus]>,
3875 limit: usize,
3876 offset: usize,
3877 ) -> Result<Vec<RunDispatch>, StorageError> {
3878 self.inner
3879 .list_dispatches(thread_id, status_filter, limit, offset)
3880 .await
3881 }
3882
3883 async fn list_terminal_dispatches(
3884 &self,
3885 limit: usize,
3886 offset: usize,
3887 ) -> Result<Vec<RunDispatch>, StorageError> {
3888 self.inner.list_terminal_dispatches(limit, offset).await
3889 }
3890
3891 async fn reclaim_expired_leases(
3892 &self,
3893 now: u64,
3894 limit: usize,
3895 ) -> Result<Vec<RunDispatch>, StorageError> {
3896 self.inner.reclaim_expired_leases(now, limit).await
3897 }
3898
3899 async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
3900 self.inner.purge_terminal(older_than).await
3901 }
3902
3903 async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
3904 self.inner.queued_thread_ids().await
3905 }
3906
3907 fn supports_dispatch_signals(&self) -> bool {
3908 true
3909 }
3910
3911 async fn pull_dispatch_signals(
3912 &self,
3913 max: usize,
3914 _expires: Duration,
3915 ) -> Result<Vec<awaken_contract::contract::mailbox::DispatchSignalEntry>, StorageError>
3916 {
3917 let mut signals = self.signals.lock().await;
3918 let mut entries = Vec::new();
3919 for _ in 0..max {
3920 let Some(signal) = signals.pop_front() else {
3921 break;
3922 };
3923 entries.push(awaken_contract::contract::mailbox::DispatchSignalEntry {
3924 thread_id: signal.thread_id.clone(),
3925 dispatch_id: signal.dispatch_id.clone(),
3926 receipt: Box::new(TestDispatchSignalReceipt {
3927 signal,
3928 queue: Arc::clone(&self.signals),
3929 acked_count: Arc::clone(&self.acked_count),
3930 nacked_count: Arc::clone(&self.nacked_count),
3931 }),
3932 });
3933 }
3934 Ok(entries)
3935 }
3936 }
3937
3938 struct InterruptOnLoadMailboxStore {
3939 inner: InMemoryMailboxStore,
3940 interrupt_once: AtomicBool,
3941 }
3942
3943 impl InterruptOnLoadMailboxStore {
3944 fn new() -> Self {
3945 Self {
3946 inner: InMemoryMailboxStore::new(),
3947 interrupt_once: AtomicBool::new(true),
3948 }
3949 }
3950 }
3951
3952 #[async_trait::async_trait]
3953 impl MailboxStore for InterruptOnLoadMailboxStore {
3954 async fn enqueue(&self, dispatch: &RunDispatch) -> Result<(), StorageError> {
3955 self.inner.enqueue(dispatch).await
3956 }
3957
3958 async fn claim(
3959 &self,
3960 thread_id: &str,
3961 consumer_id: &str,
3962 lease_ms: u64,
3963 now: u64,
3964 limit: usize,
3965 ) -> Result<Vec<RunDispatch>, StorageError> {
3966 self.inner
3967 .claim(thread_id, consumer_id, lease_ms, now, limit)
3968 .await
3969 }
3970
3971 async fn claim_dispatch(
3972 &self,
3973 dispatch_id: &str,
3974 consumer_id: &str,
3975 lease_ms: u64,
3976 now: u64,
3977 ) -> Result<Option<RunDispatch>, StorageError> {
3978 self.inner
3979 .claim_dispatch(dispatch_id, consumer_id, lease_ms, now)
3980 .await
3981 }
3982
3983 async fn ack(
3984 &self,
3985 dispatch_id: &str,
3986 claim_token: &str,
3987 now: u64,
3988 ) -> Result<(), StorageError> {
3989 self.inner.ack(dispatch_id, claim_token, now).await
3990 }
3991
3992 async fn record_dispatch_start(
3993 &self,
3994 dispatch_id: &str,
3995 claim_token: &str,
3996 dispatch_instance_id: &str,
3997 now: u64,
3998 ) -> Result<(), StorageError> {
3999 self.inner
4000 .record_dispatch_start(dispatch_id, claim_token, dispatch_instance_id, now)
4001 .await
4002 }
4003
4004 async fn record_run_result(
4005 &self,
4006 dispatch_id: &str,
4007 claim_token: &str,
4008 result: &RunDispatchResult,
4009 now: u64,
4010 ) -> Result<(), StorageError> {
4011 self.inner
4012 .record_run_result(dispatch_id, claim_token, result, now)
4013 .await
4014 }
4015
4016 async fn nack(
4017 &self,
4018 dispatch_id: &str,
4019 claim_token: &str,
4020 retry_at: u64,
4021 error: &str,
4022 now: u64,
4023 ) -> Result<(), StorageError> {
4024 self.inner
4025 .nack(dispatch_id, claim_token, retry_at, error, now)
4026 .await
4027 }
4028
4029 async fn dead_letter(
4030 &self,
4031 dispatch_id: &str,
4032 claim_token: &str,
4033 error: &str,
4034 now: u64,
4035 ) -> Result<(), StorageError> {
4036 self.inner
4037 .dead_letter(dispatch_id, claim_token, error, now)
4038 .await
4039 }
4040
4041 async fn cancel(
4042 &self,
4043 dispatch_id: &str,
4044 now: u64,
4045 ) -> Result<Option<RunDispatch>, StorageError> {
4046 self.inner.cancel(dispatch_id, now).await
4047 }
4048
4049 async fn extend_lease(
4050 &self,
4051 dispatch_id: &str,
4052 claim_token: &str,
4053 extension_ms: u64,
4054 now: u64,
4055 ) -> Result<bool, StorageError> {
4056 self.inner
4057 .extend_lease(dispatch_id, claim_token, extension_ms, now)
4058 .await
4059 }
4060
4061 async fn interrupt(
4062 &self,
4063 thread_id: &str,
4064 now: u64,
4065 ) -> Result<MailboxInterrupt, StorageError> {
4066 self.inner.interrupt(thread_id, now).await
4067 }
4068
4069 async fn interrupt_detailed(
4070 &self,
4071 thread_id: &str,
4072 now: u64,
4073 ) -> Result<MailboxInterruptDetails, StorageError> {
4074 self.inner.interrupt_detailed(thread_id, now).await
4075 }
4076
4077 async fn current_dispatch_epoch(&self, thread_id: &str) -> Result<u64, StorageError> {
4078 self.inner.current_dispatch_epoch(thread_id).await
4079 }
4080
4081 async fn supersede_claimed(
4082 &self,
4083 dispatch_id: &str,
4084 claim_token: &str,
4085 now: u64,
4086 reason: &str,
4087 ) -> Result<Option<RunDispatch>, StorageError> {
4088 self.inner
4089 .supersede_claimed(dispatch_id, claim_token, now, reason)
4090 .await
4091 }
4092
4093 async fn load_dispatch(
4094 &self,
4095 dispatch_id: &str,
4096 ) -> Result<Option<RunDispatch>, StorageError> {
4097 let loaded = self.inner.load_dispatch(dispatch_id).await?;
4098 if let Some(dispatch) = loaded.as_ref()
4099 && dispatch.status == RunDispatchStatus::Claimed
4100 && self.interrupt_once.swap(false, Ordering::SeqCst)
4101 {
4102 self.inner.interrupt(&dispatch.thread_id, now_ms()).await?;
4103 }
4104 Ok(loaded)
4105 }
4106
4107 async fn list_dispatches(
4108 &self,
4109 thread_id: &str,
4110 status_filter: Option<&[RunDispatchStatus]>,
4111 limit: usize,
4112 offset: usize,
4113 ) -> Result<Vec<RunDispatch>, StorageError> {
4114 self.inner
4115 .list_dispatches(thread_id, status_filter, limit, offset)
4116 .await
4117 }
4118
4119 async fn count_dispatches_by_status(
4120 &self,
4121 status: RunDispatchStatus,
4122 ) -> Result<usize, StorageError> {
4123 self.inner.count_dispatches_by_status(status).await
4124 }
4125
4126 async fn list_terminal_dispatches(
4127 &self,
4128 limit: usize,
4129 offset: usize,
4130 ) -> Result<Vec<RunDispatch>, StorageError> {
4131 self.inner.list_terminal_dispatches(limit, offset).await
4132 }
4133
4134 async fn reclaim_expired_leases(
4135 &self,
4136 now: u64,
4137 limit: usize,
4138 ) -> Result<Vec<RunDispatch>, StorageError> {
4139 self.inner.reclaim_expired_leases(now, limit).await
4140 }
4141
4142 async fn purge_terminal(&self, older_than: u64) -> Result<usize, StorageError> {
4143 self.inner.purge_terminal(older_than).await
4144 }
4145
4146 async fn queued_thread_ids(&self) -> Result<Vec<String>, StorageError> {
4147 self.inner.queued_thread_ids().await
4148 }
4149 }
4150
4151 fn make_runtime() -> Arc<AgentRuntime> {
4152 Arc::new(AgentRuntime::new(Arc::new(StubResolver)))
4153 }
4154
4155 fn make_mailbox(runtime: Arc<AgentRuntime>, store: Arc<InMemoryMailboxStore>) -> Arc<Mailbox> {
4156 Arc::new(Mailbox::new(
4157 runtime,
4158 store,
4159 Arc::new(InMemoryStore::new()),
4160 "test-consumer".to_string(),
4161 MailboxConfig::default(),
4162 ))
4163 }
4164
4165 fn make_mailbox_with_run_store(
4166 runtime: Arc<AgentRuntime>,
4167 store: Arc<InMemoryMailboxStore>,
4168 run_store: Arc<dyn ThreadRunStore>,
4169 ) -> Arc<Mailbox> {
4170 Arc::new(Mailbox::new(
4171 runtime,
4172 store,
4173 run_store,
4174 "test-consumer".to_string(),
4175 MailboxConfig::default(),
4176 ))
4177 }
4178
4179 struct NoopMailboxRuntime;
4180
4181 #[async_trait::async_trait]
4182 impl RunDispatchExecutor for NoopMailboxRuntime {
4183 async fn run(
4184 &self,
4185 _request: RunRequest,
4186 _sink: Arc<dyn EventSink>,
4187 ) -> Result<AgentRunResult, AgentLoopError> {
4188 panic!("decoupling test must not execute runs")
4189 }
4190
4191 fn cancel(&self, _id: &str) -> bool {
4192 false
4193 }
4194
4195 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4196 false
4197 }
4198
4199 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4200 false
4201 }
4202
4203 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4204 false
4205 }
4206 }
4207
4208 struct ImmediateLocalCancelRuntime;
4209
4210 #[async_trait::async_trait]
4211 impl RunDispatchExecutor for ImmediateLocalCancelRuntime {
4212 async fn run(
4213 &self,
4214 _request: RunRequest,
4215 _sink: Arc<dyn EventSink>,
4216 ) -> Result<AgentRunResult, AgentLoopError> {
4217 panic!("local cancel test must not execute runs")
4218 }
4219
4220 fn cancel(&self, _id: &str) -> bool {
4221 true
4222 }
4223
4224 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4225 true
4226 }
4227
4228 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4229 false
4230 }
4231
4232 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4233 false
4234 }
4235 }
4236
4237 #[derive(Default)]
4238 struct CountingMailboxRuntime {
4239 run_count: AtomicUsize,
4240 }
4241
4242 impl CountingMailboxRuntime {
4243 fn run_count(&self) -> usize {
4244 self.run_count.load(Ordering::SeqCst)
4245 }
4246 }
4247
4248 #[async_trait::async_trait]
4249 impl RunDispatchExecutor for CountingMailboxRuntime {
4250 async fn run(
4251 &self,
4252 request: RunRequest,
4253 _sink: Arc<dyn EventSink>,
4254 ) -> Result<AgentRunResult, AgentLoopError> {
4255 self.run_count.fetch_add(1, Ordering::SeqCst);
4256 Ok(AgentRunResult {
4257 run_id: request
4258 .continue_run_id
4259 .clone()
4260 .or(request.run_id_hint.clone())
4261 .or(request.dispatch_id.clone())
4262 .unwrap_or_else(|| "counted-run".to_string()),
4263 response: "ok".to_string(),
4264 termination: TerminationReason::NaturalEnd,
4265 steps: 1,
4266 })
4267 }
4268
4269 fn cancel(&self, _id: &str) -> bool {
4270 false
4271 }
4272
4273 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4274 false
4275 }
4276
4277 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4278 false
4279 }
4280
4281 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4282 false
4283 }
4284 }
4285
4286 struct RecordedMailboxRequest {
4287 run_mode: RunMode,
4288 adapter: AdapterKind,
4289 dispatch_id: Option<String>,
4290 session_id: Option<String>,
4291 }
4292
4293 #[derive(Default)]
4294 struct RecordingMailboxRuntime {
4295 requests: StdMutex<Vec<RecordedMailboxRequest>>,
4296 }
4297
4298 struct BlockingMailboxRuntime {
4299 run_count: AtomicUsize,
4300 started_tx: tokio::sync::mpsc::UnboundedSender<(usize, Option<String>)>,
4301 release_first: Arc<tokio::sync::Notify>,
4302 }
4303
4304 impl BlockingMailboxRuntime {
4305 fn new(
4306 started_tx: tokio::sync::mpsc::UnboundedSender<(usize, Option<String>)>,
4307 release_first: Arc<tokio::sync::Notify>,
4308 ) -> Self {
4309 Self {
4310 run_count: AtomicUsize::new(0),
4311 started_tx,
4312 release_first,
4313 }
4314 }
4315 }
4316
4317 #[async_trait::async_trait]
4318 impl RunDispatchExecutor for BlockingMailboxRuntime {
4319 async fn run(
4320 &self,
4321 request: RunRequest,
4322 _sink: Arc<dyn EventSink>,
4323 ) -> Result<AgentRunResult, AgentLoopError> {
4324 let ordinal = self.run_count.fetch_add(1, Ordering::SeqCst) + 1;
4325 let _ = self.started_tx.send((ordinal, request.dispatch_id.clone()));
4326 if ordinal == 1 {
4327 self.release_first.notified().await;
4328 }
4329 let run_id = request
4330 .continue_run_id
4331 .clone()
4332 .or(request.run_id_hint.clone())
4333 .or(request.dispatch_id.clone())
4334 .unwrap_or_else(|| format!("blocking-run-{ordinal}"));
4335 Ok(AgentRunResult {
4336 run_id,
4337 response: "ok".to_string(),
4338 termination: TerminationReason::NaturalEnd,
4339 steps: 1,
4340 })
4341 }
4342
4343 fn cancel(&self, _id: &str) -> bool {
4344 false
4345 }
4346
4347 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4348 false
4349 }
4350
4351 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4352 false
4353 }
4354
4355 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4356 false
4357 }
4358 }
4359
4360 #[async_trait::async_trait]
4361 impl RunDispatchExecutor for RecordingMailboxRuntime {
4362 async fn run(
4363 &self,
4364 request: RunRequest,
4365 _sink: Arc<dyn EventSink>,
4366 ) -> Result<AgentRunResult, AgentLoopError> {
4367 let run_id = request
4368 .continue_run_id
4369 .clone()
4370 .or(request.run_id_hint.clone())
4371 .unwrap_or_else(|| "recorded-run".to_string());
4372 self.requests
4373 .lock()
4374 .expect("lock poisoned")
4375 .push(RecordedMailboxRequest {
4376 run_mode: request.run_mode,
4377 adapter: request.adapter,
4378 dispatch_id: request.dispatch_id.clone(),
4379 session_id: request.session_id.clone(),
4380 });
4381 Ok(AgentRunResult {
4382 run_id,
4383 response: "ok".to_string(),
4384 termination: TerminationReason::NaturalEnd,
4385 steps: 1,
4386 })
4387 }
4388
4389 fn cancel(&self, _id: &str) -> bool {
4390 false
4391 }
4392
4393 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4394 false
4395 }
4396
4397 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4398 false
4399 }
4400
4401 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4402 false
4403 }
4404 }
4405
4406 struct RecordedStoreMailboxRequest {
4407 thread_id: String,
4408 continue_run_id: Option<String>,
4409 run_mode: RunMode,
4410 adapter: AdapterKind,
4411 }
4412
4413 struct RecordingStoreMailboxRuntime {
4414 requests: StdMutex<Vec<RecordedStoreMailboxRequest>>,
4415 }
4416
4417 impl RecordingStoreMailboxRuntime {
4418 fn new(_store: Arc<InMemoryStore>) -> Self {
4419 Self {
4420 requests: StdMutex::new(Vec::new()),
4421 }
4422 }
4423 }
4424
4425 #[async_trait::async_trait]
4426 impl RunDispatchExecutor for RecordingStoreMailboxRuntime {
4427 async fn run(
4428 &self,
4429 request: RunRequest,
4430 _sink: Arc<dyn EventSink>,
4431 ) -> Result<AgentRunResult, AgentLoopError> {
4432 let run_id = request
4433 .continue_run_id
4434 .clone()
4435 .or(request.run_id_hint.clone())
4436 .unwrap_or_else(|| "recorded-run".to_string());
4437 self.requests
4438 .lock()
4439 .expect("lock poisoned")
4440 .push(RecordedStoreMailboxRequest {
4441 thread_id: request.thread_id,
4442 continue_run_id: request.continue_run_id,
4443 run_mode: request.run_mode,
4444 adapter: request.adapter,
4445 });
4446 Ok(AgentRunResult {
4447 run_id,
4448 response: "ok".to_string(),
4449 termination: TerminationReason::NaturalEnd,
4450 steps: 1,
4451 })
4452 }
4453
4454 fn cancel(&self, _id: &str) -> bool {
4455 false
4456 }
4457
4458 async fn cancel_and_wait_by_thread(&self, _thread_id: &str) -> bool {
4459 false
4460 }
4461
4462 fn send_decision(&self, _id: &str, _tool_call_id: String, _resume: ToolCallResume) -> bool {
4463 false
4464 }
4465
4466 fn send_messages(&self, _id: &str, _messages: Vec<Message>) -> bool {
4467 false
4468 }
4469 }
4470
4471 struct ScriptedLlm {
4472 responses: StdMutex<Vec<StreamResult>>,
4473 }
4474
4475 impl ScriptedLlm {
4476 fn new(responses: Vec<StreamResult>) -> Self {
4477 Self {
4478 responses: StdMutex::new(responses),
4479 }
4480 }
4481 }
4482
4483 #[async_trait]
4484 impl LlmExecutor for ScriptedLlm {
4485 async fn execute(
4486 &self,
4487 _request: InferenceRequest,
4488 ) -> Result<StreamResult, InferenceExecutionError> {
4489 let mut responses = self.responses.lock().expect("lock poisoned");
4490 if responses.is_empty() {
4491 Ok(StreamResult {
4492 content: vec![ContentBlock::text("done")],
4493 tool_calls: vec![],
4494 usage: None,
4495 stop_reason: Some(StopReason::EndTurn),
4496 has_incomplete_tool_calls: false,
4497 })
4498 } else {
4499 Ok(responses.remove(0))
4500 }
4501 }
4502
4503 fn name(&self) -> &str {
4504 "scripted"
4505 }
4506 }
4507
4508 struct RecordingLlm {
4509 responses: StdMutex<Vec<StreamResult>>,
4510 requests: Arc<StdMutex<Vec<InferenceRequest>>>,
4511 }
4512
4513 impl RecordingLlm {
4514 fn new(
4515 responses: Vec<StreamResult>,
4516 requests: Arc<StdMutex<Vec<InferenceRequest>>>,
4517 ) -> Self {
4518 Self {
4519 responses: StdMutex::new(responses),
4520 requests,
4521 }
4522 }
4523 }
4524
4525 #[async_trait]
4526 impl LlmExecutor for RecordingLlm {
4527 async fn execute(
4528 &self,
4529 request: InferenceRequest,
4530 ) -> Result<StreamResult, InferenceExecutionError> {
4531 self.requests.lock().expect("lock poisoned").push(request);
4532 let mut responses = self.responses.lock().expect("lock poisoned");
4533 if responses.is_empty() {
4534 Ok(StreamResult {
4535 content: vec![ContentBlock::text("done")],
4536 tool_calls: vec![],
4537 usage: None,
4538 stop_reason: Some(StopReason::EndTurn),
4539 has_incomplete_tool_calls: false,
4540 })
4541 } else {
4542 Ok(responses.remove(0))
4543 }
4544 }
4545
4546 fn name(&self) -> &str {
4547 "recording"
4548 }
4549 }
4550
4551 struct FixedResolver {
4552 agent: ResolvedAgent,
4553 plugins: Vec<Arc<dyn Plugin>>,
4554 }
4555
4556 impl awaken_runtime::AgentResolver for FixedResolver {
4557 fn resolve(&self, _agent_id: &str) -> Result<ResolvedAgent, awaken_runtime::RuntimeError> {
4558 let mut agent = self.agent.clone();
4559 agent.env = build_agent_env(&self.plugins, &agent)?;
4560 Ok(agent)
4561 }
4562 }
4563
4564 struct SpawnShortBgTaskTool {
4565 manager: Arc<BackgroundTaskManager>,
4566 delay: Duration,
4567 }
4568
4569 #[async_trait]
4570 impl Tool for SpawnShortBgTaskTool {
4571 fn descriptor(&self) -> ToolDescriptor {
4572 ToolDescriptor::new("spawn_bg", "spawn_bg", "Spawn a short background task")
4573 }
4574
4575 async fn execute(
4576 &self,
4577 _args: Value,
4578 ctx: &ToolCallContext,
4579 ) -> Result<ToolOutput, ToolError> {
4580 let delay = self.delay;
4581 self.manager
4582 .spawn(
4583 &ctx.run_identity.thread_id,
4584 "bg",
4585 None,
4586 "short task",
4587 TaskParentContext::default(),
4588 move |_task_ctx| async move {
4589 sleep(delay).await;
4590 BackgroundTaskResult::Success(json!({"done": true}))
4591 },
4592 )
4593 .await
4594 .map_err(|e| ToolError::ExecutionFailed(e.to_string()))?;
4595 Ok(ToolResult::success("spawn_bg", json!({"spawned": true})).into())
4596 }
4597 }
4598
4599 struct BlockingTool {
4600 started: StdMutex<Option<tokio::sync::oneshot::Sender<()>>>,
4601 release: tokio::sync::Mutex<Option<tokio::sync::oneshot::Receiver<()>>>,
4602 }
4603
4604 impl BlockingTool {
4605 fn new(
4606 started: tokio::sync::oneshot::Sender<()>,
4607 release: tokio::sync::oneshot::Receiver<()>,
4608 ) -> Self {
4609 Self {
4610 started: StdMutex::new(Some(started)),
4611 release: tokio::sync::Mutex::new(Some(release)),
4612 }
4613 }
4614 }
4615
4616 #[async_trait]
4617 impl Tool for BlockingTool {
4618 fn descriptor(&self) -> ToolDescriptor {
4619 ToolDescriptor::new("block", "block", "wait until released")
4620 }
4621
4622 async fn execute(
4623 &self,
4624 _args: Value,
4625 _ctx: &ToolCallContext,
4626 ) -> Result<ToolOutput, ToolError> {
4627 if let Some(started) = self.started.lock().expect("lock poisoned").take() {
4628 let _ = started.send(());
4629 }
4630 let release = self.release.lock().await.take();
4631 if let Some(release) = release {
4632 let _ = release.await;
4633 }
4634 Ok(ToolResult::success("block", json!({"released": true})).into())
4635 }
4636 }
4637
4638 async fn wait_for_latest_run<F>(
4639 store: &InMemoryStore,
4640 thread_id: &str,
4641 predicate: F,
4642 ) -> RunRecord
4643 where
4644 F: Fn(&RunRecord) -> bool,
4645 {
4646 let deadline = Instant::now() + Duration::from_secs(2);
4647 loop {
4648 if let Some(run) = store
4649 .latest_run(thread_id)
4650 .await
4651 .expect("latest run lookup should succeed")
4652 && predicate(&run)
4653 {
4654 return run;
4655 }
4656
4657 assert!(
4658 Instant::now() < deadline,
4659 "timed out waiting for run predicate on thread {thread_id}"
4660 );
4661 sleep(Duration::from_millis(10)).await;
4662 }
4663 }
4664
4665 async fn wait_for_dispatch<F>(
4666 store: &InMemoryMailboxStore,
4667 dispatch_id: &str,
4668 predicate: F,
4669 ) -> RunDispatch
4670 where
4671 F: Fn(&RunDispatch) -> bool,
4672 {
4673 let deadline = Instant::now() + Duration::from_secs(2);
4674 loop {
4675 if let Some(dispatch) = store
4676 .load_dispatch(dispatch_id)
4677 .await
4678 .expect("mailbox dispatch lookup should succeed")
4679 && predicate(&dispatch)
4680 {
4681 return dispatch;
4682 }
4683
4684 assert!(
4685 Instant::now() < deadline,
4686 "timed out waiting for mailbox dispatch predicate on dispatch {dispatch_id}"
4687 );
4688 sleep(Duration::from_millis(10)).await;
4689 }
4690 }
4691
4692 async fn prepare_queued_dispatch(
4693 mailbox: &Arc<Mailbox>,
4694 thread_id: &str,
4695 content: &str,
4696 ) -> RunDispatch {
4697 let mut request =
4698 RunRequest::new(thread_id, vec![Message::user(content)]).with_agent_id("agent");
4699 let (validated_thread_id, messages) =
4700 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4701 .expect("test input should validate");
4702 mailbox
4703 .prepare_run_for_dispatch(&mut request, &validated_thread_id, &messages)
4704 .await
4705 .expect("prepare queued run");
4706 mailbox
4707 .build_dispatch(&request, &validated_thread_id)
4708 .expect("build queued dispatch")
4709 }
4710
4711 async fn enqueue_prepared_dispatch(
4712 mailbox: &Arc<Mailbox>,
4713 store: &InMemoryMailboxStore,
4714 thread_id: &str,
4715 content: &str,
4716 ) -> MailboxSubmitResult {
4717 let dispatch = prepare_queued_dispatch(mailbox, thread_id, content).await;
4718 let result = MailboxSubmitResult {
4719 dispatch_id: dispatch.dispatch_id.clone(),
4720 run_id: dispatch.run_id.clone(),
4721 thread_id: dispatch.thread_id.clone(),
4722 status: MailboxDispatchStatus::Queued,
4723 };
4724 store
4725 .enqueue(&dispatch)
4726 .await
4727 .expect("enqueue queued dispatch");
4728 result
4729 }
4730
4731 fn seeded_waiting_run(run_id: &str, thread_id: &str, agent_id: &str) -> RunRecord {
4732 RunRecord {
4733 run_id: run_id.to_string(),
4734 thread_id: thread_id.to_string(),
4735 agent_id: agent_id.to_string(),
4736 parent_run_id: None,
4737 request: None,
4738 input: None,
4739 output: None,
4740 status: RunStatus::Waiting,
4741 termination_reason: None,
4742 final_output: None,
4743 error_payload: None,
4744 dispatch_id: None,
4745 session_id: None,
4746 transport_request_id: None,
4747 waiting: Some(RunWaitingState {
4748 reason: WaitingReason::BackgroundTasks,
4749 ticket_ids: Vec::new(),
4750 tickets: Vec::new(),
4751 since_dispatch_id: None,
4752 message: None,
4753 }),
4754 outcome: None,
4755 created_at: 1,
4756 started_at: None,
4757 finished_at: None,
4758 updated_at: 1,
4759 steps: 2,
4760 input_tokens: 0,
4761 output_tokens: 0,
4762 state: None,
4763 }
4764 }
4765
/// Pins the default values of every `MailboxConfig` field checked below.
#[test]
fn mailbox_config_defaults() {
    let defaults = MailboxConfig::default();

    // Lease settings.
    assert_eq!(defaults.lease_ms, 30_000);
    assert_eq!(defaults.suspended_lease_ms, 600_000);
    assert_eq!(defaults.lease_renewal_interval, Duration::from_millis(10_000));

    // Background maintenance cadence.
    assert_eq!(defaults.sweep_interval, Duration::from_millis(30_000));
    assert_eq!(defaults.gc_interval, Duration::from_millis(60_000));
    assert_eq!(defaults.gc_ttl, Duration::from_secs(86_400));

    // Retry policy.
    assert_eq!(defaults.default_max_attempts, 5);
    assert_eq!(defaults.default_retry_delay_ms, 250);
    assert_eq!(defaults.max_retry_delay_ms, 30_000);
}
4781
/// Checks three points of the blocked-nack delay: no argument yields 500 ms,
/// `Some(3)` yields 2 s, and `Some(100)` yields 30 s.
#[test]
fn dispatch_signal_blocked_nack_delay_backs_off_and_caps() {
    let expectations = [
        (None, Duration::from_millis(500)),
        (Some(3), Duration::from_secs(2)),
        (Some(100), Duration::from_secs(30)),
    ];

    for (input, expected_delay) in expectations {
        assert_eq!(dispatch_signal_blocked_nack_delay(input), expected_delay);
    }
}
4797
/// Pins the defaults of `MailboxLifecycleConfig`: no startup delay, a single
/// recovery attempt with a 250 ms retry delay, and no maintenance callback.
#[test]
fn mailbox_lifecycle_config_defaults() {
    let defaults = MailboxLifecycleConfig::default();
    let recovery = &defaults.startup_recovery;

    assert_eq!(defaults.startup_delay, Duration::ZERO);
    assert_eq!(recovery.max_attempts, 1);
    assert_eq!(recovery.retry_delay, Duration::from_millis(250));
    assert!(defaults.maintenance_callback.is_none());
}
4809
4810 #[tokio::test]
4811 async fn start_lifecycle_ready_fails_when_startup_recovery_fails() {
4812 let store = Arc::new(RecoverFlakyMailboxStore::new(1));
4813 let runtime = make_runtime();
4814 let mailbox = Arc::new(Mailbox::new(
4815 runtime,
4816 store,
4817 Arc::new(InMemoryStore::new()),
4818 "test-consumer".to_string(),
4819 MailboxConfig::default(),
4820 ));
4821
4822 let error = match mailbox
4823 .start_lifecycle_ready(MailboxLifecycleConfig {
4824 startup_recovery: MailboxStartupRecoveryConfig {
4825 max_attempts: 1,
4826 retry_delay: Duration::ZERO,
4827 },
4828 ..Default::default()
4829 })
4830 .await
4831 {
4832 Ok(_) => panic!("ready lifecycle should fail when startup recovery fails"),
4833 Err(error) => error,
4834 };
4835
4836 assert!(
4837 error
4838 .to_string()
4839 .contains("injected startup recovery failure")
4840 );
4841 assert!(
4842 !mailbox
4843 .lifecycle_is_running()
4844 .expect("lifecycle state should be readable")
4845 );
4846 }
4847
4848 #[tokio::test]
4849 async fn start_lifecycle_ready_retries_startup_recovery_until_ready() {
4850 let store = Arc::new(RecoverFlakyMailboxStore::new(1));
4851 let runtime = make_runtime();
4852 let mailbox = Arc::new(Mailbox::new(
4853 runtime,
4854 store.clone(),
4855 Arc::new(InMemoryStore::new()),
4856 "test-consumer".to_string(),
4857 MailboxConfig::default(),
4858 ));
4859
4860 let mut request = RunRequest::new("thread-retry-recover", vec![Message::user("recover")])
4861 .with_agent_id("missing-agent");
4862 let (thread_id, messages) =
4863 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4864 .unwrap();
4865 mailbox
4866 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
4867 .await
4868 .expect("prepare queued run");
4869 let dispatch = mailbox
4870 .build_dispatch(&request, &thread_id)
4871 .expect("build queued dispatch");
4872 let dispatch_id = dispatch.dispatch_id.clone();
4873 store
4874 .enqueue(&dispatch)
4875 .await
4876 .expect("enqueue queued dispatch");
4877
4878 let handle = mailbox
4879 .start_lifecycle_ready(MailboxLifecycleConfig {
4880 startup_recovery: MailboxStartupRecoveryConfig {
4881 max_attempts: 2,
4882 retry_delay: Duration::ZERO,
4883 },
4884 ..Default::default()
4885 })
4886 .await
4887 .expect("ready lifecycle should retry startup recovery");
4888
4889 let recovered = wait_for_dispatch(&store.inner, &dispatch_id, |dispatch| {
4890 dispatch.status == RunDispatchStatus::DeadLetter
4891 })
4892 .await;
4893 assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
4894 handle.shutdown().await.expect("shutdown lifecycle");
4895 }
4896
4897 #[tokio::test]
4898 async fn start_lifecycle_ready_serializes_concurrent_recovery() {
4899 let store = Arc::new(RecoverFlakyMailboxStore::new(0));
4900 let runtime = make_runtime();
4901 let mailbox = Arc::new(Mailbox::new(
4902 runtime,
4903 store.clone(),
4904 Arc::new(InMemoryStore::new()),
4905 "test-consumer".to_string(),
4906 MailboxConfig::default(),
4907 ));
4908
4909 let mut starters = Vec::new();
4910 for _ in 0..32 {
4911 let mailbox = Arc::clone(&mailbox);
4912 starters.push(tokio::spawn(async move {
4913 mailbox
4914 .start_lifecycle_ready(MailboxLifecycleConfig::default())
4915 .await
4916 }));
4917 }
4918
4919 let mut handles = Vec::new();
4920 for starter in starters {
4921 handles.push(
4922 starter
4923 .await
4924 .expect("starter task should not panic")
4925 .expect("ready lifecycle should start"),
4926 );
4927 }
4928
4929 assert_eq!(
4930 store.reclaim_calls(),
4931 1,
4932 "concurrent ready starts should run startup recovery once"
4933 );
4934 assert!(handles.iter().all(MailboxLifecycleHandle::is_running));
4935 handles[0].shutdown().await.expect("shutdown lifecycle");
4936 assert!(handles.iter().all(|handle| !handle.is_running()));
4937 }
4938
4939 #[tokio::test]
4940 async fn start_lifecycle_does_not_bypass_ready_transition() {
4941 let store = Arc::new(RecoverFlakyMailboxStore::new(0));
4942 let runtime = make_runtime();
4943 let mailbox = Arc::new(Mailbox::new(
4944 runtime,
4945 store.clone(),
4946 Arc::new(InMemoryStore::new()),
4947 "test-consumer".to_string(),
4948 MailboxConfig::default(),
4949 ));
4950
4951 let ready_mailbox = Arc::clone(&mailbox);
4952 let ready = tokio::spawn(async move {
4953 ready_mailbox
4954 .start_lifecycle_ready(MailboxLifecycleConfig {
4955 startup_delay: Duration::from_millis(75),
4956 startup_recovery: MailboxStartupRecoveryConfig {
4957 max_attempts: 1,
4958 retry_delay: Duration::ZERO,
4959 },
4960 ..Default::default()
4961 })
4962 .await
4963 });
4964 sleep(Duration::from_millis(10)).await;
4965
4966 let err = match mailbox.start_lifecycle(MailboxLifecycleConfig::default()) {
4967 Ok(_) => panic!("sync start must not race ready startup"),
4968 Err(error) => error,
4969 };
4970 assert!(
4971 err.to_string()
4972 .contains("lifecycle transition is already running")
4973 );
4974
4975 let handle = ready
4976 .await
4977 .expect("ready task should not panic")
4978 .expect("ready lifecycle should start");
4979 assert_eq!(
4980 store.reclaim_calls(),
4981 1,
4982 "ready recovery should not be duplicated by sync start"
4983 );
4984 handle.shutdown().await.expect("shutdown lifecycle");
4985 }
4986
4987 #[tokio::test]
4988 async fn start_lifecycle_is_idempotent_and_drop_does_not_abort_recovery() {
4989 let store = make_store();
4990 let runtime = make_runtime();
4991 let mailbox = make_mailbox(runtime, store.clone());
4992
4993 let mut request = RunRequest::new("thread-drop-recover", vec![Message::user("recover")])
4994 .with_agent_id("missing-agent");
4995 let (thread_id, messages) =
4996 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
4997 .unwrap();
4998 mailbox
4999 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5000 .await
5001 .expect("prepare queued run");
5002 let dispatch = mailbox
5003 .build_dispatch(&request, &thread_id)
5004 .expect("build queued dispatch");
5005 let dispatch_id = dispatch.dispatch_id.clone();
5006 store
5007 .enqueue(&dispatch)
5008 .await
5009 .expect("enqueue queued dispatch");
5010
5011 let handle = mailbox
5012 .start_lifecycle(MailboxLifecycleConfig {
5013 startup_delay: Duration::from_millis(10),
5014 ..Default::default()
5015 })
5016 .expect("lifecycle start should succeed");
5017 let duplicate = mailbox
5018 .start_lifecycle(MailboxLifecycleConfig::default())
5019 .expect("duplicate lifecycle start should be a no-op");
5020 assert!(handle.is_running());
5021 assert!(duplicate.is_running());
5022
5023 drop(handle);
5024 drop(duplicate);
5025
5026 wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5027 dispatch.status == RunDispatchStatus::DeadLetter
5028 })
5029 .await;
5030
5031 let cleanup = mailbox
5032 .start_lifecycle(MailboxLifecycleConfig::default())
5033 .expect("should return the existing lifecycle handle");
5034 cleanup.shutdown().await.expect("shutdown lifecycle");
5035 assert!(!cleanup.is_running());
5036 }
5037
5038 #[tokio::test]
5039 async fn start_lifecycle_explicit_abort_allows_restart() {
5040 let store = make_store();
5041 let runtime = make_runtime();
5042 let mailbox = make_mailbox(runtime, store);
5043
5044 let first = mailbox
5045 .start_lifecycle(MailboxLifecycleConfig::default())
5046 .expect("first lifecycle start should succeed");
5047 assert!(first.is_running());
5048 first.shutdown().await.expect("shutdown first lifecycle");
5049 assert!(!first.is_running());
5050
5051 let second = mailbox
5052 .start_lifecycle(MailboxLifecycleConfig::default())
5053 .expect("lifecycle should restart after explicit abort");
5054 assert!(second.is_running());
5055 second.shutdown().await.expect("shutdown second lifecycle");
5056 assert!(!second.is_running());
5057 }
5058
5059 #[tokio::test]
5060 async fn maintenance_callback_runs_on_gc_tick() {
5061 let store = make_store();
5062 let runtime = make_runtime();
5063 let mailbox = Arc::new(Mailbox::new(
5064 runtime,
5065 store,
5066 Arc::new(InMemoryStore::new()),
5067 "test-consumer".to_string(),
5068 MailboxConfig {
5069 gc_interval: Duration::from_millis(10),
5070 sweep_interval: Duration::from_secs(60),
5071 ..Default::default()
5072 },
5073 ));
5074 let calls = Arc::new(AtomicUsize::new(0));
5075 let calls_for_hook = Arc::clone(&calls);
5076 let handle = mailbox
5077 .start_lifecycle(MailboxLifecycleConfig {
5078 maintenance_callback: Some(Arc::new(move || {
5079 calls_for_hook.fetch_add(1, Ordering::SeqCst);
5080 })),
5081 ..Default::default()
5082 })
5083 .expect("lifecycle should start");
5084
5085 let deadline = Instant::now() + Duration::from_secs(1);
5086 while calls.load(Ordering::SeqCst) == 0 {
5087 assert!(
5088 Instant::now() < deadline,
5089 "maintenance callback did not run"
5090 );
5091 sleep(Duration::from_millis(5)).await;
5092 }
5093 handle.shutdown().await.expect("shutdown lifecycle");
5094 }
5095
5096 #[tokio::test]
5097 async fn start_lifecycle_handle_drop_keeps_lifecycle_running() {
5098 let store = make_store();
5099 let runtime = make_runtime();
5100 let mailbox = make_mailbox(runtime, store);
5101
5102 let handle = mailbox
5103 .start_lifecycle(MailboxLifecycleConfig::default())
5104 .expect("lifecycle should start");
5105 assert!(handle.is_running());
5106 drop(handle);
5107
5108 let handle = mailbox
5109 .start_lifecycle(MailboxLifecycleConfig::default())
5110 .expect("lifecycle should still be running after handle drop");
5111 assert!(handle.is_running());
5112 handle.shutdown().await.expect("shutdown lifecycle");
5113 }
5114
5115 #[tokio::test]
5116 async fn lifecycle_shutdown_waits_for_maintenance_to_quiesce() {
5117 let store = make_store();
5118 let runtime = make_runtime();
5119 let mailbox = Arc::new(Mailbox::new(
5120 runtime,
5121 store,
5122 Arc::new(InMemoryStore::new()),
5123 "test-consumer".to_string(),
5124 MailboxConfig {
5125 gc_interval: Duration::from_millis(10),
5126 sweep_interval: Duration::from_secs(60),
5127 ..Default::default()
5128 },
5129 ));
5130 let calls = Arc::new(AtomicUsize::new(0));
5131 let calls_for_hook = Arc::clone(&calls);
5132 let handle = mailbox
5133 .start_lifecycle(MailboxLifecycleConfig {
5134 maintenance_callback: Some(Arc::new(move || {
5135 calls_for_hook.fetch_add(1, Ordering::SeqCst);
5136 })),
5137 ..Default::default()
5138 })
5139 .expect("lifecycle should start");
5140
5141 let deadline = Instant::now() + Duration::from_secs(1);
5142 while calls.load(Ordering::SeqCst) == 0 {
5143 assert!(
5144 Instant::now() < deadline,
5145 "maintenance callback did not run"
5146 );
5147 sleep(Duration::from_millis(5)).await;
5148 }
5149
5150 handle.shutdown().await.expect("shutdown should quiesce");
5151 assert!(!handle.is_running());
5152 let calls_after_shutdown = calls.load(Ordering::SeqCst);
5153 sleep(Duration::from_millis(40)).await;
5154 assert_eq!(
5155 calls.load(Ordering::SeqCst),
5156 calls_after_shutdown,
5157 "maintenance callback should not run after shutdown completes"
5158 );
5159 }
5160
5161 #[tokio::test]
5162 async fn concurrent_start_lifecycle_is_idempotent() {
5163 let store = make_store();
5164 let runtime = make_runtime();
5165 let mailbox = make_mailbox(runtime, store);
5166
5167 let mut joins = Vec::new();
5168 for _ in 0..32 {
5169 let mb = Arc::clone(&mailbox);
5170 joins.push(tokio::spawn(async move {
5171 mb.start_lifecycle(MailboxLifecycleConfig::default())
5172 }));
5173 }
5174
5175 let mut handles = Vec::new();
5176 for join in joins {
5177 match join.await.expect("start task should not panic") {
5178 Ok(handle) => handles.push(handle),
5179 Err(err) => panic!("idempotent lifecycle start should not fail: {err}"),
5180 }
5181 }
5182
5183 assert_eq!(handles.len(), 32, "all concurrent starters get a handle");
5184 assert!(handles.iter().all(MailboxLifecycleHandle::is_running));
5185 handles[0].shutdown().await.expect("shutdown lifecycle");
5186 assert!(handles.iter().all(|handle| !handle.is_running()));
5187 }
5188
5189 #[tokio::test]
5190 async fn start_lifecycle_runs_startup_recovery_for_existing_queued_dispatches() {
5191 let store = make_store();
5192 let runtime = make_runtime();
5193 let mailbox = make_mailbox(runtime, store.clone());
5194
5195 let mut request = RunRequest::new("thread-recover", vec![Message::user("recover me")])
5196 .with_agent_id("missing-agent");
5197 let (thread_id, messages) =
5198 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5199 .unwrap();
5200 mailbox
5201 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5202 .await
5203 .expect("prepare queued run");
5204 let dispatch = mailbox
5205 .build_dispatch(&request, &thread_id)
5206 .expect("build queued dispatch");
5207 let dispatch_id = dispatch.dispatch_id.clone();
5208 store
5209 .enqueue(&dispatch)
5210 .await
5211 .expect("enqueue queued dispatch");
5212
5213 let handle = mailbox
5214 .start_lifecycle(MailboxLifecycleConfig::default())
5215 .expect("lifecycle should start");
5216
5217 let recovered = wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5218 dispatch.status == RunDispatchStatus::DeadLetter
5219 })
5220 .await;
5221
5222 assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
5223 assert!(
5224 recovered
5225 .last_error
5226 .as_deref()
5227 .is_some_and(|error| error.contains("missing-agent")),
5228 "dead-letter error should preserve the runtime failure: {recovered:?}"
5229 );
5230 handle.shutdown().await.expect("shutdown lifecycle");
5231 }
5232
5233 #[tokio::test]
5234 async fn start_lifecycle_reclaims_expired_claimed_dispatches_and_executes_them() {
5235 let store = make_store();
5236 let runtime = make_runtime();
5237 let mailbox = make_mailbox(runtime, store.clone());
5238
5239 let mut request = RunRequest::new("thread-stale", vec![Message::user("recover stale")])
5240 .with_agent_id("missing-agent");
5241 let (thread_id, messages) =
5242 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5243 .unwrap();
5244 mailbox
5245 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5246 .await
5247 .expect("prepare stale run");
5248 let dispatch = mailbox
5249 .build_dispatch(&request, &thread_id)
5250 .expect("build stale claimed dispatch");
5251 let dispatch_id = dispatch.dispatch_id.clone();
5252 let claim_now = dispatch.available_at;
5253 store
5254 .enqueue(&dispatch)
5255 .await
5256 .expect("enqueue queued dispatch");
5257 let claimed = store
5258 .claim("thread-stale", "dead-consumer", 1, claim_now, 1)
5259 .await
5260 .expect("claim dispatch before simulated crash");
5261 assert_eq!(claimed.len(), 1);
5262 assert_eq!(claimed[0].status, RunDispatchStatus::Claimed);
5263 assert_eq!(claimed[0].lease_until, Some(claim_now + 1));
5264 sleep(Duration::from_millis(2)).await;
5265
5266 let handle = mailbox
5267 .start_lifecycle(MailboxLifecycleConfig::default())
5268 .expect("lifecycle should start");
5269
5270 let recovered = wait_for_dispatch(&store, &dispatch_id, |dispatch| {
5271 dispatch.status == RunDispatchStatus::DeadLetter
5272 && dispatch.run_status == Some(RunStatus::Done)
5273 })
5274 .await;
5275
5276 assert_eq!(recovered.status, RunDispatchStatus::DeadLetter);
5277 assert_eq!(recovered.attempt_count, 1);
5278 let run_id = recovered.run_id.as_str();
5279 assert_ne!(
5280 run_id, dispatch_id,
5281 "recovered stale dispatches should also keep run id separate from mailbox dispatch id"
5282 );
5283 assert!(recovered.dispatch_instance_id.is_some());
5284 assert!(matches!(
5285 recovered.termination,
5286 Some(TerminationReason::Error(ref message)) if message.contains("missing-agent")
5287 ));
5288 assert!(
5289 recovered
5290 .run_error
5291 .as_deref()
5292 .is_some_and(|error| error.contains("missing-agent"))
5293 );
5294 handle.shutdown().await.expect("shutdown lifecycle");
5295 }
5296
5297 #[tokio::test]
5298 async fn dispatch_signal_loop_claims_and_executes_queued_dispatch() {
5299 let store = Arc::new(SignalMailboxStore::new());
5300 let run_store = Arc::new(InMemoryStore::new());
5301 let runtime = Arc::new(RecordingMailboxRuntime::default());
5302 let mailbox = Arc::new(Mailbox::new_with_executor(
5303 runtime,
5304 store.clone(),
5305 run_store.clone(),
5306 "signal-consumer".to_string(),
5307 MailboxConfig::default(),
5308 ));
5309
5310 let mut request = RunRequest::new("thread-signal-loop", vec![Message::user("wake")])
5311 .with_agent_id("agent");
5312 let (thread_id, messages) =
5313 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5314 .expect("input should validate");
5315 mailbox
5316 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5317 .await
5318 .expect("prepare run");
5319 let dispatch = mailbox
5320 .build_dispatch(&request, &thread_id)
5321 .expect("build dispatch");
5322 let dispatch_id = dispatch.dispatch_id.clone();
5323 store.enqueue(&dispatch).await.expect("enqueue dispatch");
5324
5325 let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5326 let deadline = Instant::now() + Duration::from_secs(2);
5327 let acked = loop {
5328 if let Some(dispatch) = store
5329 .load_dispatch(&dispatch_id)
5330 .await
5331 .expect("dispatch lookup should succeed")
5332 && dispatch.status == RunDispatchStatus::Acked
5333 {
5334 break dispatch;
5335 }
5336 assert!(
5337 Instant::now() < deadline,
5338 "timed out waiting for dispatch signal loop"
5339 );
5340 sleep(Duration::from_millis(10)).await;
5341 };
5342 signal_loop.abort();
5343
5344 assert_eq!(acked.status, RunDispatchStatus::Acked);
5345 assert_eq!(store.acked_signal_count(), 1);
5346 }
5347
5348 #[tokio::test]
5349 async fn dispatch_signal_loop_nacks_and_redelivers_after_claim_error() {
5350 let store = Arc::new(SignalMailboxStore::with_claim_failures(1));
5351 let run_store = Arc::new(InMemoryStore::new());
5352 let runtime = Arc::new(RecordingMailboxRuntime::default());
5353 let mailbox = Arc::new(Mailbox::new_with_executor(
5354 runtime,
5355 store.clone(),
5356 run_store.clone(),
5357 "signal-consumer".to_string(),
5358 MailboxConfig::default(),
5359 ));
5360
5361 let mut request = RunRequest::new("thread-signal-redeliver", vec![Message::user("wake")])
5362 .with_agent_id("agent");
5363 let (thread_id, messages) =
5364 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5365 .expect("input should validate");
5366 mailbox
5367 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5368 .await
5369 .expect("prepare run");
5370 let dispatch = mailbox
5371 .build_dispatch(&request, &thread_id)
5372 .expect("build dispatch");
5373 let dispatch_id = dispatch.dispatch_id.clone();
5374 store.enqueue(&dispatch).await.expect("enqueue dispatch");
5375
5376 let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5377 let deadline = Instant::now() + Duration::from_secs(2);
5378 loop {
5379 let dispatch = store
5380 .load_dispatch(&dispatch_id)
5381 .await
5382 .expect("dispatch lookup should succeed")
5383 .expect("dispatch should exist");
5384 if dispatch.status == RunDispatchStatus::Acked {
5385 break;
5386 }
5387 assert!(
5388 Instant::now() < deadline,
5389 "timed out waiting for redelivered dispatch signal"
5390 );
5391 sleep(Duration::from_millis(10)).await;
5392 }
5393 signal_loop.abort();
5394
5395 assert_eq!(store.nacked_signal_count(), 1);
5396 assert_eq!(store.acked_signal_count(), 1);
5397 }
5398
5399 #[tokio::test]
5400 async fn dispatch_signal_loop_nacks_when_signal_is_blocked_by_active_claim() {
5401 let store = Arc::new(SignalMailboxStore::new());
5402 let run_store = Arc::new(InMemoryStore::new());
5403 let runtime = Arc::new(RecordingMailboxRuntime::default());
5404 let mailbox = Arc::new(Mailbox::new_with_executor(
5405 runtime,
5406 store.clone(),
5407 run_store.clone(),
5408 "signal-consumer".to_string(),
5409 MailboxConfig::default(),
5410 ));
5411
5412 let mut active = RunRequest::new("thread-signal-blocked", vec![Message::user("active")])
5413 .with_agent_id("agent");
5414 let (thread_id, active_messages) =
5415 validate_run_inputs(active.thread_id.clone(), active.messages.clone(), false)
5416 .expect("active input should validate");
5417 mailbox
5418 .prepare_run_for_dispatch(&mut active, &thread_id, &active_messages)
5419 .await
5420 .expect("prepare active run");
5421 let active_dispatch = mailbox
5422 .build_dispatch(&active, &thread_id)
5423 .expect("build active dispatch");
5424 let active_dispatch_id = active_dispatch.dispatch_id.clone();
5425 store
5426 .enqueue(&active_dispatch)
5427 .await
5428 .expect("enqueue active dispatch");
5429 let claimed = store
5430 .claim(&thread_id, "remote-owner", 30_000, now_ms(), 1)
5431 .await
5432 .expect("claim active dispatch");
5433 assert_eq!(claimed.len(), 1);
5434 let active_claim_token = claimed[0].claim_token.clone().unwrap();
5435
5436 let mut queued = RunRequest::new("thread-signal-blocked", vec![Message::user("queued")])
5437 .with_agent_id("agent");
5438 let (_, queued_messages) =
5439 validate_run_inputs(queued.thread_id.clone(), queued.messages.clone(), false)
5440 .expect("queued input should validate");
5441 mailbox
5442 .prepare_run_for_dispatch(&mut queued, &thread_id, &queued_messages)
5443 .await
5444 .expect("prepare queued run");
5445 let queued_dispatch = mailbox
5446 .build_dispatch(&queued, &thread_id)
5447 .expect("build queued dispatch");
5448 let queued_dispatch_id = queued_dispatch.dispatch_id.clone();
5449 store
5450 .enqueue(&queued_dispatch)
5451 .await
5452 .expect("enqueue queued dispatch");
5453
5454 let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
5455 let deadline = Instant::now() + Duration::from_secs(2);
5456 loop {
5457 if store.nacked_signal_count() > 0 {
5458 break;
5459 }
5460 assert!(
5461 Instant::now() < deadline,
5462 "queued signal blocked by an active claim must be nacked for redelivery"
5463 );
5464 sleep(Duration::from_millis(10)).await;
5465 }
5466
5467 let queued_before_release = store
5468 .load_dispatch(&queued_dispatch_id)
5469 .await
5470 .expect("queued dispatch lookup")
5471 .expect("queued dispatch exists");
5472 assert_eq!(queued_before_release.status, RunDispatchStatus::Queued);
5473
5474 store
5475 .ack(&active_dispatch_id, &active_claim_token, now_ms())
5476 .await
5477 .expect("release active claim");
5478
5479 let deadline = Instant::now() + Duration::from_secs(2);
5480 loop {
5481 let dispatch = store
5482 .load_dispatch(&queued_dispatch_id)
5483 .await
5484 .expect("queued dispatch lookup")
5485 .expect("queued dispatch exists");
5486 if dispatch.status == RunDispatchStatus::Acked {
5487 break;
5488 }
5489 assert!(
5490 Instant::now() < deadline,
5491 "redelivered signal should claim after active claim releases"
5492 );
5493 sleep(Duration::from_millis(10)).await;
5494 }
5495 signal_loop.abort();
5496
5497 assert!(
5498 store.nacked_signal_count() >= 1,
5499 "blocked queued signal must be nacked at least once"
5500 );
5501 assert!(
5502 store.acked_signal_count() >= 2,
5503 "active signal and final queued signal should both be acked"
5504 );
5505 }
5506
/// A request built with `new` + `with_agent_id` carries the given thread id,
/// agent id, and message, and defaults to `Foreground` / `Internal`.
#[test]
fn run_request_fields() {
    let greeting = Message::user("hello");
    let request = RunRequest::new("t-1", vec![greeting]).with_agent_id("agent-a");

    // Values supplied explicitly.
    assert_eq!(request.thread_id, "t-1");
    assert_eq!(request.agent_id.as_deref(), Some("agent-a"));
    assert_eq!(request.messages.len(), 1);

    // Values left at their defaults.
    assert_eq!(request.run_mode, RunMode::Foreground);
    assert_eq!(request.adapter, AdapterKind::Internal);
}
5516
/// With no messages and the third argument `false`, validation must fail
/// with `MailboxError::Validation`.
#[test]
fn run_spec_validation_empty_messages_errors() {
    let outcome = validate_run_inputs(String::from("thread-1"), Vec::new(), false);

    assert!(outcome.is_err());
    assert!(matches!(outcome, Err(MailboxError::Validation(_))));
}
5523
/// With the third argument `true`, an empty message list is accepted and the
/// thread id is returned unchanged.
#[test]
fn run_spec_validation_allows_decision_only_resume() {
    let outcome = validate_run_inputs(String::from("thread-1"), Vec::new(), true);
    assert!(outcome.is_ok());

    let (validated_thread_id, validated_messages) = outcome.unwrap();
    assert_eq!(validated_thread_id, "thread-1");
    assert!(validated_messages.is_empty());
}
5532
/// A whitespace-only thread id must be accepted and replaced by an id that is
/// neither empty nor blank.
#[test]
fn run_spec_validation_blank_thread_id_generates_new() {
    let outcome = validate_run_inputs(String::from(" "), vec![Message::user("hi")], false);
    assert!(outcome.is_ok());

    let (generated_thread_id, _messages) = outcome.unwrap();
    assert!(!generated_thread_id.is_empty());
    assert_ne!(generated_thread_id.trim(), "");
}
5541
/// Surrounding whitespace must be stripped from the thread id during validation.
#[test]
fn run_spec_validation_trims_thread_id() {
    let padded_thread_id = String::from(" my-thread ");
    let outcome = validate_run_inputs(padded_thread_id, vec![Message::user("hi")], false);
    assert!(outcome.is_ok());

    let (trimmed_thread_id, _messages) = outcome.unwrap();
    assert_eq!(trimmed_thread_id, "my-thread");
}
5549
/// Smoke test that the `Running` and `Queued` variants exist and match themselves.
#[test]
fn dispatch_status_enum_variants() {
    let (running_status, queued_status) = (
        MailboxDispatchStatus::Running,
        MailboxDispatchStatus::Queued,
    );

    assert!(matches!(running_status, MailboxDispatchStatus::Running));
    assert!(matches!(queued_status, MailboxDispatchStatus::Queued));
}
5557
/// A mailbox can be built from any `RunDispatchExecutor` trait object, here a
/// no-op executor, and keeps the consumer id it was given.
#[test]
fn mailbox_construction_depends_on_runtime_boundary_not_agent_runtime() {
    // The explicit trait-object annotation is the point of the test: the
    // constructor only needs the executor boundary.
    let executor: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
    let run_store = Arc::new(InMemoryStore::new());
    let consumer_id = "decoupled-consumer".to_string();

    let mailbox = Mailbox::new_with_executor(
        executor,
        make_store(),
        run_store,
        consumer_id,
        MailboxConfig::default(),
    );

    assert_eq!(mailbox.consumer_id, "decoupled-consumer");
}
5571
5572 #[tokio::test]
5573 async fn submit_background_enqueues_dispatch() {
5574 let store = make_store();
5575 let runtime = make_runtime();
5576 let mailbox = make_mailbox(runtime, store.clone());
5577
5578 let request =
5579 RunRequest::new("thread-1", vec![Message::user("hello")]).with_agent_id("agent-1");
5580 let result = mailbox.submit_background(request).await.unwrap();
5581
5582 assert_eq!(result.thread_id, "thread-1");
5583 assert!(!result.dispatch_id.is_empty());
5584 assert!(!result.run_id.is_empty());
5585 assert_ne!(result.dispatch_id, result.run_id);
5586
5587 let dispatches = store
5589 .list_dispatches("thread-1", None, 100, 0)
5590 .await
5591 .unwrap();
5592 assert!(!dispatches.is_empty());
5593 assert_eq!(dispatches[0].run_id, result.run_id);
5594 }
5595
5596 #[tokio::test]
5597 async fn submit_background_delivers_scheduled_policy_context() {
5598 let store = make_store();
5599 let runtime = Arc::new(RecordingMailboxRuntime::default());
5600 let mailbox = Arc::new(Mailbox::new(
5601 runtime.clone(),
5602 store,
5603 Arc::new(InMemoryStore::new()),
5604 "recording-consumer".to_string(),
5605 MailboxConfig::default(),
5606 ));
5607
5608 let result = mailbox
5609 .submit_background(
5610 RunRequest::new("thread-policy-bg", vec![Message::user("hello")])
5611 .with_agent_id("agent-1")
5612 .with_adapter(AdapterKind::Acp),
5613 )
5614 .await
5615 .expect("background submit should enqueue");
5616
5617 let deadline = Instant::now() + Duration::from_secs(1);
5618 loop {
5619 if !runtime.requests.lock().expect("lock poisoned").is_empty() {
5620 break;
5621 }
5622 assert!(Instant::now() < deadline, "runtime did not receive request");
5623 sleep(Duration::from_millis(5)).await;
5624 }
5625
5626 let requests = runtime.requests.lock().expect("lock poisoned");
5627 assert_eq!(requests.len(), 1);
5628 assert_eq!(requests[0].run_mode, RunMode::Scheduled);
5629 assert_eq!(requests[0].adapter, AdapterKind::Acp);
5630 assert_eq!(
5631 requests[0].dispatch_id.as_deref(),
5632 Some(result.dispatch_id.as_str())
5633 );
5634 assert!(
5635 requests[0].session_id.is_some(),
5636 "dispatch session id should be set"
5637 );
5638 }
5639
5640 #[tokio::test]
5641 async fn prepare_run_for_dispatch_precreates_created_run_and_thread_projection() {
5642 let thread_store = Arc::new(InMemoryStore::new());
5643 let runtime = Arc::new(
5644 AgentRuntime::new(Arc::new(StubResolver))
5645 .with_thread_run_store(thread_store.clone() as Arc<dyn ThreadRunStore>),
5646 );
5647 let mailbox_store = make_store();
5648 let mailbox = make_mailbox_with_run_store(
5649 runtime,
5650 mailbox_store,
5651 thread_store.clone() as Arc<dyn ThreadRunStore>,
5652 );
5653 let mut request = RunRequest::new("thread-created", vec![Message::user("plan this")])
5654 .with_agent_id("agent-created")
5655 .with_transport_request_id("transport-created");
5656 let (thread_id, messages) =
5657 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5658 .unwrap();
5659
5660 let run_id = mailbox
5661 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5662 .await
5663 .expect("precreate");
5664
5665 assert_eq!(request.run_id_hint.as_deref(), Some(run_id.as_str()));
5666 let run = thread_store
5667 .load_run(&run_id)
5668 .await
5669 .expect("load run")
5670 .expect("created run");
5671 assert_eq!(run.status, RunStatus::Created);
5672 assert_eq!(run.agent_id, "agent-created");
5673 let request_snapshot = run.request.as_ref().unwrap();
5674 assert!(
5675 !request_snapshot.input_message_ids.is_empty(),
5676 "new run snapshots should reference thread messages instead of duplicating bodies"
5677 );
5678 assert_eq!(request_snapshot.input_message_count, 1);
5679 assert_eq!(
5680 request_snapshot.input_message_ids,
5681 vec![messages[0].id.clone().expect("message id")]
5682 );
5683 let input = run.input.as_ref().expect("run input message range");
5684 assert_eq!(input.thread_id, "thread-created");
5685 assert_eq!(input.range.unwrap().from_seq, 1);
5686 assert_eq!(input.range.unwrap().to_seq, 1);
5687 assert_eq!(
5688 input.trigger_message_ids,
5689 vec![messages[0].id.clone().expect("message id")]
5690 );
5691 assert_eq!(
5692 run.request
5693 .as_ref()
5694 .unwrap()
5695 .transport_request_id
5696 .as_deref(),
5697 Some("transport-created")
5698 );
5699 let thread = thread_store
5700 .load_thread("thread-created")
5701 .await
5702 .expect("load thread")
5703 .expect("thread projection");
5704 assert_eq!(thread.open_run_id.as_deref(), Some(run_id.as_str()));
5705 assert_eq!(thread.latest_run_id.as_deref(), Some(run_id.as_str()));
5706 assert!(thread.active_run_id.is_none());
5707 }
5708
5709 #[tokio::test]
5710 async fn prepare_run_for_dispatch_inherits_previous_runtime_state() {
5711 let thread_store = Arc::new(InMemoryStore::new());
5712 let mut previous = seeded_waiting_run("run-prev", "thread-state", "agent-prev");
5713 previous.status = RunStatus::Done;
5714 previous.state = Some(awaken_contract::state::PersistedState {
5715 revision: 7,
5716 extensions: std::collections::HashMap::from([(
5717 "remote".to_string(),
5718 json!({"context_id": "remote-ctx-1"}),
5719 )]),
5720 });
5721 thread_store
5722 .checkpoint("thread-state", &[Message::user("first")], &previous)
5723 .await
5724 .expect("seed previous run");
5725
5726 let runtime = Arc::new(
5727 AgentRuntime::new(Arc::new(StubResolver))
5728 .with_thread_run_store(thread_store.clone() as Arc<dyn ThreadRunStore>),
5729 );
5730 let mailbox = make_mailbox_with_run_store(
5731 runtime,
5732 make_store(),
5733 thread_store.clone() as Arc<dyn ThreadRunStore>,
5734 );
5735 let mut request = RunRequest::new("thread-state", vec![Message::user("second")]);
5736 let (thread_id, messages) =
5737 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
5738 .unwrap();
5739
5740 let run_id = mailbox
5741 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
5742 .await
5743 .expect("precreate");
5744
5745 let run = thread_store
5746 .load_run(&run_id)
5747 .await
5748 .expect("load run")
5749 .expect("created run");
5750 assert_eq!(run.status, RunStatus::Created);
5751 assert_eq!(run.agent_id, "agent-prev");
5752 let input = run.input.as_ref().expect("run input message range");
5753 assert_eq!(input.range.unwrap().from_seq, 1);
5754 assert_eq!(input.range.unwrap().to_seq, 2);
5755 let state = run.state.expect("inherited runtime state");
5756 assert_eq!(state.revision, 7);
5757 assert_eq!(state.extensions["remote"]["context_id"], "remote-ctx-1");
5758 }
5759
5760 #[tokio::test]
5761 async fn cancel_queued_dispatch_works() {
5762 crate::metrics::install_recorder();
5763 let store = make_store();
5764 let run_store = Arc::new(InMemoryStore::new());
5765 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
5766 let mailbox = Arc::new(Mailbox::new_with_executor(
5767 runtime,
5768 store.clone(),
5769 run_store.clone(),
5770 "test-consumer".to_string(),
5771 MailboxConfig::default(),
5772 ));
5773
5774 let result =
5775 enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-cancel", "hello").await;
5776 let dispatch_id = result.dispatch_id.clone();
5777
5778 let cancelled = mailbox.cancel(&dispatch_id).await.unwrap();
5779 assert!(cancelled);
5780
5781 let after = store.load_dispatch(&dispatch_id).await.unwrap().unwrap();
5782 assert_eq!(after.status, RunDispatchStatus::Cancelled);
5783
5784 let run = run_store
5785 .load_run(&result.run_id)
5786 .await
5787 .unwrap()
5788 .expect("queued cancel should keep run inspectable");
5789 assert_eq!(run.status, RunStatus::Done);
5790 assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
5791 assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
5792
5793 let output = crate::metrics::render().unwrap_or_default();
5794 assert!(output.contains("operation=\"mark_run_cancelled\""));
5795 assert!(output.contains("outcome=\"cancelled\""));
5796 }
5797
5798 #[tokio::test]
5799 async fn list_dispatches_returns_entries() {
5800 let store = make_store();
5801 let runtime = make_runtime();
5802 let mailbox = make_mailbox(runtime, store.clone());
5803
5804 for i in 0..3 {
5805 let request = RunRequest::new("thread-list", vec![Message::user("msg")])
5806 .with_agent_id(format!("agent-{i}"));
5807 mailbox.submit_background(request).await.unwrap();
5808 }
5809
5810 let dispatches = mailbox
5811 .list_dispatches("thread-list", None, 100, 0)
5812 .await
5813 .unwrap();
5814 assert_eq!(dispatches.len(), 3);
5815 }
5816
/// `MailboxError` display strings prefix the message with the error kind.
#[test]
fn mailbox_error_display() {
    let cases = [
        (
            MailboxError::Validation("test".to_string()),
            "validation error: test",
        ),
        (
            MailboxError::Internal("oops".to_string()),
            "internal error: oops",
        ),
    ];

    for (error, rendered) in cases {
        assert_eq!(error.to_string(), rendered);
    }
}
5825
/// `MailboxSubmitResult` exposes the values it was constructed with.
#[test]
fn mailbox_submit_result_fields() {
    let submitted = MailboxSubmitResult {
        dispatch_id: "dispatch-1".into(),
        run_id: "run-1".into(),
        thread_id: "thread-1".into(),
        status: MailboxDispatchStatus::Running,
    };

    let MailboxSubmitResult {
        dispatch_id,
        run_id,
        thread_id,
        status,
    } = submitted;
    assert_eq!(dispatch_id, "dispatch-1");
    assert_eq!(run_id, "run-1");
    assert_eq!(thread_id, "thread-1");
    assert!(matches!(status, MailboxDispatchStatus::Running));
}
5839
5840 #[tokio::test]
5841 async fn suspension_aware_sink_sets_flag_on_suspended_tool_call() {
5842 use awaken_contract::contract::event_sink::{EventSink, VecEventSink};
5843 use awaken_contract::contract::suspension::ToolCallOutcome;
5844 use awaken_contract::contract::tool::{ToolResult, ToolStatus};
5845
5846 let inner: Arc<dyn EventSink> = Arc::new(VecEventSink::new());
5847 let suspended = Arc::new(AtomicBool::new(false));
5848 let sink = SuspensionAwareSink {
5849 inner: Arc::clone(&inner),
5850 suspended: Arc::clone(&suspended),
5851 };
5852
5853 sink.emit(AgentEvent::ToolCallDone {
5855 id: "c1".into(),
5856 message_id: "m1".into(),
5857 result: ToolResult {
5858 tool_name: "echo".into(),
5859 status: ToolStatus::Success,
5860 data: serde_json::json!("ok"),
5861 message: None,
5862 suspension: None,
5863 metadata: Default::default(),
5864 },
5865 outcome: ToolCallOutcome::Succeeded,
5866 })
5867 .await;
5868 assert!(!suspended.load(Ordering::Acquire));
5869
5870 sink.emit(AgentEvent::ToolCallDone {
5872 id: "c2".into(),
5873 message_id: "m2".into(),
5874 result: ToolResult {
5875 tool_name: "approve".into(),
5876 status: ToolStatus::Pending,
5877 data: serde_json::json!("pending"),
5878 message: None,
5879 suspension: None,
5880 metadata: Default::default(),
5881 },
5882 outcome: ToolCallOutcome::Suspended,
5883 })
5884 .await;
5885 assert!(suspended.load(Ordering::Acquire));
5886
5887 sink.emit(AgentEvent::ToolCallResumed {
5889 target_id: "c2".into(),
5890 result: serde_json::json!({"approved": true}),
5891 })
5892 .await;
5893 assert!(!suspended.load(Ordering::Acquire));
5894 }
5895
/// A successful run result classifies as `Completed`.
#[test]
fn classify_error_ok_is_completed() {
    use awaken_contract::contract::lifecycle::TerminationReason;

    let finished = awaken_runtime::loop_runner::AgentRunResult {
        run_id: String::from("run-1"),
        response: String::from("done"),
        termination: TerminationReason::NaturalEnd,
        steps: 1,
    };
    let result = Ok(finished);

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::Completed));
}
5912
/// `RuntimeError::ThreadAlreadyRunning` classifies as a permanent error.
#[test]
fn classify_error_thread_already_running_is_permanent() {
    use awaken_runtime::RuntimeError;
    use awaken_runtime::loop_runner::AgentLoopError;

    let busy_thread = RuntimeError::ThreadAlreadyRunning {
        thread_id: String::from("t1"),
    };
    let result = Err(AgentLoopError::RuntimeError(busy_thread));

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::PermanentError(_)));
}
5927
/// `RuntimeError::AgentNotFound` classifies as a permanent error.
#[test]
fn classify_error_agent_not_found_is_permanent() {
    use awaken_runtime::RuntimeError;
    use awaken_runtime::loop_runner::AgentLoopError;

    let unknown_agent = RuntimeError::AgentNotFound {
        agent_id: String::from("missing"),
    };
    let result = Err(AgentLoopError::RuntimeError(unknown_agent));

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::PermanentError(_)));
}
5940
/// `RuntimeError::ResolveFailed` classifies as a permanent error.
#[test]
fn classify_error_resolve_failed_is_permanent() {
    use awaken_runtime::RuntimeError;
    use awaken_runtime::loop_runner::AgentLoopError;

    let unresolved = RuntimeError::ResolveFailed {
        message: String::from("not found"),
    };
    let result = Err(AgentLoopError::RuntimeError(unresolved));

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::PermanentError(_)));
}
5953
/// `AgentLoopError::StorageError` classifies as a transient error.
#[test]
fn classify_error_storage_error_is_transient() {
    use awaken_runtime::loop_runner::AgentLoopError;

    let storage_failure = AgentLoopError::StorageError(String::from("disk full"));
    let result = Err(storage_failure);

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::TransientError(_)));
}
5963
/// `AgentLoopError::InferenceFailed` classifies as a transient error.
#[test]
fn classify_error_inference_failed_is_transient() {
    use awaken_runtime::loop_runner::AgentLoopError;

    let inference_failure = AgentLoopError::InferenceFailed(String::from("timeout"));
    let result = Err(inference_failure);

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::TransientError(_)));
}
5973
/// `AgentLoopError::PhaseError` classifies as `Completed` rather than as a
/// transient or permanent error.
#[test]
fn classify_error_phase_error_is_completed() {
    use awaken_runtime::loop_runner::AgentLoopError;

    let unknown_key = awaken_contract::StateError::UnknownKey {
        key: String::from("bad"),
    };
    let result = Err(AgentLoopError::PhaseError(unknown_key));

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::Completed));
}
5988
/// `AgentLoopError::InvalidResume` classifies as `Completed`.
#[test]
fn classify_error_invalid_resume_is_completed() {
    use awaken_runtime::loop_runner::AgentLoopError;

    let invalid_resume = AgentLoopError::InvalidResume(String::from("bad resume"));
    let result = Err(invalid_resume);

    let outcome = classify_error(&result);
    assert!(matches!(outcome, MailboxRunOutcome::Completed));
}
5998
/// A thread id without surrounding whitespace passes through unchanged, as
/// does the single input message.
#[test]
fn validate_run_inputs_preserves_normal_thread_id() {
    let input_messages = vec![Message::user("hi")];
    let validated = validate_run_inputs("my-thread".into(), input_messages, false);

    let (thread_id, kept_messages) = validated.unwrap();
    assert_eq!(thread_id, "my-thread");
    assert_eq!(kept_messages.len(), 1);
}
6008
/// All three input messages survive validation.
#[test]
fn validate_run_inputs_multiple_messages() {
    let input_messages = vec![Message::user("a"), Message::user("b"), Message::user("c")];

    let (_, kept_messages) = validate_run_inputs("t".into(), input_messages, false).unwrap();

    assert_eq!(kept_messages.len(), 3);
}
6019
/// An empty thread id is replaced by a generated id in hyphenated UUID text
/// form: 36 characters laid out as 8-4-4-4-12 hexadecimal digits.
#[test]
fn validate_run_inputs_empty_string_generates_uuid() {
    let (thread_id, _) =
        validate_run_inputs("".into(), vec![Message::user("hi")], false).unwrap();
    assert!(!thread_id.is_empty());
    assert_eq!(thread_id.len(), 36);

    // Length alone would accept any 36-byte string, so also pin the layout:
    // hyphens at offsets 8, 13, 18 and 23, hexadecimal digits everywhere else.
    for (offset, ch) in thread_id.chars().enumerate() {
        if matches!(offset, 8 | 13 | 18 | 23) {
            assert_eq!(ch, '-', "expected '-' at offset {offset} of {thread_id}");
        } else {
            assert!(
                ch.is_ascii_hexdigit(),
                "expected hex digit at offset {offset} of {thread_id}"
            );
        }
    }
}
6028
/// A fully specified `MailboxConfig` reads back the values it was given.
#[test]
fn mailbox_config_custom_values() {
    let custom = MailboxConfig {
        lease_ms: 5_000,
        suspended_lease_ms: 60_000,
        lease_renewal_interval: Duration::from_secs(2),
        sweep_interval: Duration::from_secs(5),
        gc_interval: Duration::from_secs(10),
        gc_ttl: Duration::from_secs(3600),
        default_max_attempts: 3,
        default_retry_delay_ms: 500,
        max_retry_delay_ms: 60_000,
    };

    let MailboxConfig {
        lease_ms,
        default_max_attempts,
        default_retry_delay_ms,
        max_retry_delay_ms,
        ..
    } = custom;
    assert_eq!(lease_ms, 5_000);
    assert_eq!(default_max_attempts, 3);
    assert_eq!(default_retry_delay_ms, 500);
    assert_eq!(max_retry_delay_ms, 60_000);
}
6049
6050 #[tokio::test]
6053 async fn build_dispatch_sets_correct_fields() {
6054 let store = make_store();
6055 let runtime = make_runtime();
6056 let mailbox = make_mailbox(runtime, store);
6057
6058 let request =
6059 RunRequest::new("thread-42", vec![Message::user("test")]).with_run_id_hint("run-42");
6060 let dispatch = mailbox.build_dispatch(&request, "thread-42").unwrap();
6061
6062 assert_eq!(dispatch.thread_id, "thread-42");
6063 assert_eq!(dispatch.run_id, "run-42");
6064 assert_eq!(dispatch.status, RunDispatchStatus::Queued);
6065 assert_eq!(dispatch.attempt_count, 0);
6066 assert_eq!(dispatch.max_attempts, 5); assert_eq!(dispatch.priority, 128);
6068 assert_eq!(dispatch.dispatch_epoch, 0);
6069 assert!(dispatch.claim_token.is_none());
6070 assert!(dispatch.claimed_by.is_none());
6071 assert!(dispatch.lease_until.is_none());
6072 assert!(dispatch.last_error.is_none());
6073 }
6074
/// `build_dispatch` fails for a request that carries no run-id hint.
#[test]
fn build_dispatch_requires_prepared_run_id() {
    let mailbox = make_mailbox(make_runtime(), make_store());
    let unprepared = RunRequest::new("thread-1", vec![Message::user("hi")]);

    let outcome = mailbox.build_dispatch(&unprepared, "thread-1");

    assert!(outcome.is_err());
}
6084
6085 #[tokio::test]
6086 async fn prepare_run_preserves_request_extras_on_run_snapshot() {
6087 let store = make_store();
6088 let runtime = make_runtime();
6089 let thread_store = Arc::new(InMemoryStore::new());
6090 let mailbox = Arc::new(Mailbox::new(
6091 runtime,
6092 store,
6093 thread_store.clone(),
6094 "test-consumer".to_string(),
6095 MailboxConfig::default(),
6096 ));
6097
6098 let mut request = RunRequest::new("thread-ext", vec![Message::user("hi")])
6099 .with_agent_id("a1")
6100 .with_frontend_tools(vec![awaken_contract::contract::tool::ToolDescriptor::new(
6101 "ft1", "FT1", "desc",
6102 )]);
6103 let (thread_id, messages) =
6104 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6105 .unwrap();
6106 let run_id = mailbox
6107 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6108 .await
6109 .unwrap();
6110 let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6111
6112 let snapshot = run.request.expect("request snapshot");
6113 assert_eq!(snapshot.frontend_tools.len(), 1);
6114 assert!(snapshot.request_extras.is_some());
6115 }
6116
/// `RunRequestExtras` survives a `to_value` / `from_value` round trip with its
/// frontend tools, run mode and adapter intact.
#[test]
fn run_request_extras_serde_roundtrip() {
    use awaken_contract::contract::tool::ToolDescriptor;

    let original = RunRequestExtras {
        overrides: None,
        decisions: vec![],
        frontend_tools: vec![ToolDescriptor::new("ft1", "FT1", "desc")],
        continue_run_id: None,
        run_id_hint: None,
        dispatch_id_hint: None,
        parent_thread_id: None,
        transport_request_id: None,
        run_mode: RunMode::Scheduled,
        adapter: AdapterKind::Acp,
    };

    let encoded = original.to_value().unwrap().unwrap();
    let decoded = RunRequestExtras::from_value(&encoded).unwrap();

    assert_eq!(decoded.frontend_tools.len(), 1);
    assert_eq!(decoded.frontend_tools[0].id, "ft1");
    assert!(decoded.decisions.is_empty());
    assert!(decoded.overrides.is_none());
    assert_eq!(decoded.run_mode, RunMode::Scheduled);
    assert_eq!(decoded.adapter, AdapterKind::Acp);
}
6141
/// Extras with no optional data, `Foreground` mode and the `Internal` adapter
/// serialize to `None`.
#[test]
fn run_request_extras_empty_returns_none() {
    let bare = RunRequestExtras {
        overrides: None,
        decisions: vec![],
        frontend_tools: vec![],
        continue_run_id: None,
        run_id_hint: None,
        dispatch_id_hint: None,
        parent_thread_id: None,
        transport_request_id: None,
        run_mode: RunMode::Foreground,
        adapter: AdapterKind::Internal,
    };

    let encoded = bare.to_value().unwrap();

    assert!(encoded.is_none());
}
6158
/// `RunRequestExtras::apply_to` copies tools, id hints, parent thread,
/// transport request id, run mode and adapter onto a fresh request.
#[test]
fn run_request_extras_apply_to_request() {
    use awaken_contract::contract::tool::ToolDescriptor;

    let extras = RunRequestExtras {
        overrides: None,
        decisions: vec![],
        frontend_tools: vec![ToolDescriptor::new("ft1", "FT1", "desc")],
        continue_run_id: None,
        run_id_hint: Some("run-1".into()),
        dispatch_id_hint: Some("dispatch-1".into()),
        parent_thread_id: Some("parent-thread".into()),
        transport_request_id: Some("transport-1".into()),
        run_mode: RunMode::Resume,
        adapter: AdapterKind::AgUi,
    };
    let base_request = RunRequest::new("t1", vec![Message::user("hi")]);

    let merged = extras.apply_to(base_request);

    assert_eq!(merged.frontend_tools.len(), 1);
    assert_eq!(merged.run_id_hint.as_deref(), Some("run-1"));
    assert_eq!(merged.dispatch_id_hint.as_deref(), Some("dispatch-1"));
    assert_eq!(merged.parent_thread_id.as_deref(), Some("parent-thread"));
    assert_eq!(merged.transport_request_id.as_deref(), Some("transport-1"));
    assert_eq!(merged.run_mode, RunMode::Resume);
    assert_eq!(merged.adapter, AdapterKind::AgUi);
}
6184
6185 #[tokio::test]
6186 async fn prepare_run_round_trips_parent_thread_id() {
6187 let store = make_store();
6188 let runtime = make_runtime();
6189 let thread_store = Arc::new(InMemoryStore::new());
6190 let mailbox = Arc::new(Mailbox::new(
6191 runtime,
6192 store,
6193 thread_store.clone(),
6194 "test-consumer".to_string(),
6195 MailboxConfig::default(),
6196 ));
6197 thread_store
6198 .save_thread(&Thread::with_id("thread-parent"))
6199 .await
6200 .unwrap();
6201
6202 let mut request = RunRequest::new("thread-child", vec![Message::user("hi")])
6203 .with_agent_id("agent")
6204 .with_parent_thread_id("thread-parent");
6205 let (thread_id, messages) =
6206 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6207 .unwrap();
6208 let run_id = mailbox
6209 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6210 .await
6211 .unwrap();
6212 let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6213
6214 assert_eq!(
6215 run.request
6216 .as_ref()
6217 .and_then(|snapshot| snapshot.parent_thread_id.as_deref()),
6218 Some("thread-parent")
6219 );
6220 }
6221
6222 #[tokio::test]
6223 async fn prepare_run_preserves_origin_metadata() {
6224 let store = make_store();
6225 let runtime = make_runtime();
6226 let thread_store = Arc::new(InMemoryStore::new());
6227 let mailbox = Arc::new(Mailbox::new(
6228 runtime,
6229 store,
6230 thread_store.clone(),
6231 "test-consumer".to_string(),
6232 MailboxConfig::default(),
6233 ));
6234
6235 let mut request = RunRequest::new("thread-meta", vec![Message::user("hi")])
6236 .with_agent_id("a1")
6237 .with_origin(RunRequestOrigin::A2A)
6238 .with_parent_run_id("parent-run-1");
6239 let (thread_id, messages) =
6240 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6241 .unwrap();
6242 let run_id = mailbox
6243 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6244 .await
6245 .unwrap();
6246 let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6247 let snapshot = run.request.as_ref().unwrap();
6248
6249 assert!(matches!(snapshot.origin, RunRequestOrigin::A2A));
6250 assert_eq!(run.parent_run_id.as_deref(), Some("parent-run-1"));
6251 }
6252
6253 #[tokio::test]
6254 async fn prepare_run_defaults_origin_to_user() {
6255 let store = make_store();
6256 let runtime = make_runtime();
6257 let thread_store = Arc::new(InMemoryStore::new());
6258 let mailbox = Arc::new(Mailbox::new(
6259 runtime,
6260 store,
6261 thread_store.clone(),
6262 "test-consumer".to_string(),
6263 MailboxConfig::default(),
6264 ));
6265
6266 let mut request = RunRequest::new("thread-default", vec![Message::user("hi")]);
6267 let (thread_id, messages) =
6268 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
6269 .unwrap();
6270 let run_id = mailbox
6271 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
6272 .await
6273 .unwrap();
6274 let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
6275
6276 assert!(matches!(
6277 run.request.as_ref().unwrap().origin,
6278 RunRequestOrigin::User
6279 ));
6280 assert!(run.parent_run_id.is_none());
6281 }
6282
/// A `StorageError` converts into a `MailboxError` whose display text
/// mentions "store error".
#[test]
fn mailbox_error_store_variant() {
    use awaken_contract::contract::storage::StorageError;

    let storage_error = StorageError::NotFound("x".to_string());
    let converted = MailboxError::from(storage_error);

    let rendered = converted.to_string();
    assert!(rendered.contains("store error"));
}
6292
/// The `Debug` output of each `MailboxRunOutcome` includes the variant name
/// or the carried message.
#[test]
fn mailbox_run_outcome_debug() {
    let completed = MailboxRunOutcome::Completed;
    let transient = MailboxRunOutcome::TransientError("oops".to_string());
    let permanent = MailboxRunOutcome::PermanentError("fatal".to_string());

    let completed_text = format!("{completed:?}");
    let transient_text = format!("{transient:?}");
    let permanent_text = format!("{permanent:?}");

    assert!(completed_text.contains("Completed"));
    assert!(transient_text.contains("oops"));
    assert!(permanent_text.contains("fatal"));
}
6304
/// Each `MailboxRunOutcome` variant maps to a fixed metric label.
#[test]
fn mailbox_run_outcome_metric_labels_are_stable() {
    let cases = [
        (MailboxRunOutcome::Completed, "completed"),
        (
            MailboxRunOutcome::TransientError("retry".into()),
            "transient_error",
        ),
        (
            MailboxRunOutcome::PermanentError("fatal".into()),
            "permanent_error",
        ),
    ];

    for (outcome, label) in cases {
        assert_eq!(outcome.metric_label(), label);
    }
}
6317
6318 #[tokio::test]
6319 async fn mailbox_execution_records_dispatch_latency_metrics() {
6320 crate::metrics::install_recorder();
6321 let mailbox_store = make_store();
6322 let run_store = Arc::new(InMemoryStore::new());
6323 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(RecordingMailboxRuntime::default());
6324 let mailbox = Arc::new(Mailbox::new_with_executor(
6325 runtime,
6326 mailbox_store.clone(),
6327 run_store,
6328 "test-consumer".to_string(),
6329 MailboxConfig::default(),
6330 ));
6331
6332 let submitted = mailbox
6333 .submit_background(RunRequest::new("thread-metrics", vec![Message::user("go")]))
6334 .await
6335 .expect("submit should succeed");
6336
6337 wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
6338 dispatch.status == RunDispatchStatus::Acked
6339 })
6340 .await;
6341
6342 let output = crate::metrics::render().unwrap_or_default();
6343 assert!(output.contains("awaken_mailbox_dispatch_enqueue_to_start_seconds"));
6344 assert!(output.contains("awaken_mailbox_dispatch_eligible_to_start_seconds"));
6345 assert!(output.contains("awaken_mailbox_dispatch_claim_to_start_seconds"));
6346 assert!(output.contains("awaken_mailbox_dispatch_enqueue_to_complete_seconds"));
6347 assert!(output.contains("awaken_mailbox_dispatch_runtime_seconds"));
6348 assert!(output.contains("awaken_runs_total"));
6349 assert!(output.contains("awaken_run_duration_seconds"));
6350 assert!(output.contains("awaken_mailbox_operations_total"));
6351 assert!(output.contains("awaken_mailbox_operation_duration_seconds"));
6352 assert!(output.contains("awaken_mailbox_dispatch_depth"));
6353 assert!(output.contains("status=\"queued\""));
6354 assert!(output.contains("operation=\"enqueue\""));
6355 assert!(output.contains("operation=\"claim\""));
6356 assert!(output.contains("operation=\"ack\""));
6357 }
6358
6359 #[tokio::test]
6360 async fn mailbox_lease_renewal_is_wired_and_prevents_reclaim() {
6361 crate::metrics::install_recorder();
6362 let mailbox_store = make_store();
6363 let run_store = Arc::new(InMemoryStore::new());
6364 let (started_tx, mut started_rx) = tokio::sync::mpsc::unbounded_channel();
6365 let release_first = Arc::new(tokio::sync::Notify::new());
6366 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(BlockingMailboxRuntime::new(
6367 started_tx,
6368 Arc::clone(&release_first),
6369 ));
6370 let mailbox = Arc::new(Mailbox::new_with_executor(
6371 runtime,
6372 mailbox_store.clone(),
6373 run_store,
6374 "lease-metrics-consumer".to_string(),
6375 MailboxConfig {
6376 lease_ms: 80,
6377 lease_renewal_interval: Duration::from_millis(20),
6378 ..MailboxConfig::default()
6379 },
6380 ));
6381
6382 let submitted = mailbox
6383 .submit_background(RunRequest::new(
6384 "thread-lease-renewal",
6385 vec![Message::user("go")],
6386 ))
6387 .await
6388 .expect("submit should succeed");
6389 tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
6390 .await
6391 .expect("runtime should start")
6392 .expect("runtime should report start");
6393
6394 let initial_lease = mailbox_store
6395 .load_dispatch(&submitted.dispatch_id)
6396 .await
6397 .expect("load dispatch")
6398 .expect("dispatch should exist")
6399 .lease_until
6400 .expect("claimed dispatch should have a lease");
6401
6402 tokio::time::timeout(Duration::from_secs(2), async {
6403 loop {
6404 let dispatch = mailbox_store
6405 .load_dispatch(&submitted.dispatch_id)
6406 .await
6407 .expect("load dispatch")
6408 .expect("dispatch should exist");
6409 if dispatch
6410 .lease_until
6411 .is_some_and(|lease| lease > initial_lease)
6412 {
6413 break;
6414 }
6415 sleep(Duration::from_millis(10)).await;
6416 }
6417 })
6418 .await
6419 .expect("lease renewal should extend the claimed dispatch");
6420
6421 let reclaimed = mailbox_store
6422 .reclaim_expired_leases(now_ms(), 10)
6423 .await
6424 .expect("manual reclaim should succeed");
6425 assert!(
6426 reclaimed.is_empty(),
6427 "renewed dispatch must not be reclaimed while runtime is active"
6428 );
6429
6430 release_first.notify_one();
6431 wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
6432 dispatch.status == RunDispatchStatus::Acked
6433 })
6434 .await;
6435
6436 let output = crate::metrics::render().unwrap_or_default();
6437 assert!(output.contains("operation=\"lease_renewal\""));
6438 assert!(output.contains("result=\"ok\""));
6439 }
6440
6441 #[tokio::test]
6442 async fn background_success_records_run_result_and_keeps_dispatch_id_separate_from_run_id() {
6443 let mailbox_store = make_store();
6444 let run_store = Arc::new(InMemoryStore::new());
6445 let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
6446 content: vec![ContentBlock::text("finished")],
6447 tool_calls: vec![],
6448 usage: None,
6449 stop_reason: Some(StopReason::EndTurn),
6450 has_incomplete_tool_calls: false,
6451 }]));
6452 let resolver = Arc::new(FixedResolver {
6453 agent: ResolvedAgent::new("agent", "m", "sys", llm),
6454 plugins: vec![],
6455 });
6456 let runtime = Arc::new(AgentRuntime::new(resolver));
6457 let mailbox = Arc::new(Mailbox::new(
6458 runtime,
6459 mailbox_store.clone(),
6460 run_store,
6461 "test-consumer".to_string(),
6462 MailboxConfig::default(),
6463 ));
6464
6465 let submitted = mailbox
6466 .submit_background(
6467 RunRequest::new("thread-run-result", vec![Message::user("go")])
6468 .with_agent_id("agent"),
6469 )
6470 .await
6471 .expect("submit should succeed");
6472
6473 let acked = wait_for_dispatch(&mailbox_store, &submitted.dispatch_id, |dispatch| {
6474 dispatch.status == RunDispatchStatus::Acked
6475 && dispatch.run_status == Some(RunStatus::Done)
6476 })
6477 .await;
6478
6479 let run_id = acked.run_id.as_str();
6480 assert_ne!(
6481 run_id, submitted.dispatch_id,
6482 "default mailbox dispatch IDs must not be used as canonical run IDs"
6483 );
6484 assert!(acked.dispatch_instance_id.is_some());
6485 assert_eq!(acked.termination, Some(TerminationReason::NaturalEnd));
6486 assert_eq!(acked.run_response.as_deref(), Some("finished"));
6487 assert!(acked.run_error.is_none());
6488 assert!(acked.completed_at.is_some());
6489 }
6490
6491 #[tokio::test]
6492 async fn background_permanent_error_records_run_result_before_dead_letter() {
6493 let store = make_store();
6494 let runtime = make_runtime();
6495 let mailbox = make_mailbox(runtime, store.clone());
6496
6497 let submitted = mailbox
6498 .submit_background(
6499 RunRequest::new("thread-missing-agent", vec![Message::user("go")])
6500 .with_agent_id("missing-agent"),
6501 )
6502 .await
6503 .expect("submit should succeed");
6504
6505 let dead = wait_for_dispatch(&store, &submitted.dispatch_id, |dispatch| {
6506 dispatch.status == RunDispatchStatus::DeadLetter
6507 && dispatch.run_status == Some(RunStatus::Done)
6508 && dispatch.run_error.is_some()
6509 })
6510 .await;
6511
6512 let run_id = dead.run_id.as_str();
6513 assert_ne!(
6514 run_id, submitted.dispatch_id,
6515 "synthetic terminal events must preserve canonical run id instead of reusing dispatch id"
6516 );
6517 assert!(dead.dispatch_instance_id.is_some());
6518 assert!(matches!(
6519 dead.termination,
6520 Some(TerminationReason::Error(ref message)) if message.contains("missing-agent")
6521 ));
6522 assert!(
6523 dead.last_error
6524 .as_deref()
6525 .is_some_and(|error| error.contains("missing-agent"))
6526 );
6527 assert!(
6528 dead.run_error
6529 .as_deref()
6530 .is_some_and(|error| error.contains("missing-agent"))
6531 );
6532 assert!(dead.completed_at.is_some());
6533 }
6534
6535 #[tokio::test]
6536 async fn reconstruct_failure_cleans_worker_and_dispatches_next_queued() {
6537 let store = make_store();
6538 let thread_store = Arc::new(InMemoryStore::new());
6539 let runtime = Arc::new(RecordingMailboxRuntime::default());
6540 let mailbox = Arc::new(Mailbox::new_with_executor(
6541 runtime,
6542 store.clone(),
6543 thread_store.clone(),
6544 "test-consumer".to_string(),
6545 MailboxConfig::default(),
6546 ));
6547 let thread_id = "thread-reconstruct-next";
6548 let now = now_ms();
6549
6550 let missing = RunDispatch {
6551 dispatch_id: "dispatch-missing-run".to_string(),
6552 thread_id: thread_id.to_string(),
6553 run_id: "missing-run".to_string(),
6554 priority: 10,
6555 dedupe_key: None,
6556 dispatch_epoch: 0,
6557 status: RunDispatchStatus::Queued,
6558 available_at: now,
6559 attempt_count: 0,
6560 max_attempts: 3,
6561 last_error: None,
6562 claim_token: None,
6563 claimed_by: None,
6564 lease_until: None,
6565 dispatch_instance_id: None,
6566 run_status: None,
6567 termination: None,
6568 run_response: None,
6569 run_error: None,
6570 completed_at: None,
6571 created_at: now,
6572 updated_at: now,
6573 };
6574 store.enqueue(&missing).await.expect("enqueue missing run");
6575
6576 let mut next_request =
6577 RunRequest::new(thread_id, vec![Message::user("next")]).with_agent_id("agent");
6578 let (_, next_messages) = validate_run_inputs(
6579 next_request.thread_id.clone(),
6580 next_request.messages.clone(),
6581 false,
6582 )
6583 .expect("next input should validate");
6584 mailbox
6585 .prepare_run_for_dispatch(&mut next_request, thread_id, &next_messages)
6586 .await
6587 .expect("prepare next run");
6588 let mut next = mailbox
6589 .build_dispatch(&next_request, thread_id)
6590 .expect("build next dispatch");
6591 next.priority = 20;
6592 next.created_at = now.saturating_add(1);
6593 next.updated_at = next.created_at;
6594 let next_dispatch_id = next.dispatch_id.clone();
6595 store.enqueue(&next).await.expect("enqueue next");
6596
6597 mailbox.get_or_create_worker(thread_id).await;
6598 assert_eq!(
6599 mailbox.try_dispatch_next(thread_id).await,
6600 DispatchAttempt::Claimed
6601 );
6602
6603 let dead = wait_for_dispatch(&store, "dispatch-missing-run", |dispatch| {
6604 dispatch.status == RunDispatchStatus::DeadLetter
6605 })
6606 .await;
6607 assert_eq!(dead.status, RunDispatchStatus::DeadLetter);
6608
6609 let acked = wait_for_dispatch(&store, &next_dispatch_id, |dispatch| {
6610 dispatch.status == RunDispatchStatus::Acked
6611 })
6612 .await;
6613 assert_eq!(acked.status, RunDispatchStatus::Acked);
6614 }
6615
/// Smoke check that the `Running` and `Queued` variants of `MailboxDispatchStatus` can be
/// constructed and matched against themselves.
#[test]
fn dispatch_status_queued_zero() {
    let (running_status, queued_status) = (
        MailboxDispatchStatus::Running,
        MailboxDispatchStatus::Queued,
    );
    assert!(matches!(running_status, MailboxDispatchStatus::Running));
    assert!(matches!(queued_status, MailboxDispatchStatus::Queued));
}
6625
6626 #[tokio::test]
6629 async fn interrupt_bumps_dispatch_epoch() {
6630 let store = make_store();
6631 let runtime = make_runtime();
6632 let mailbox = make_mailbox(runtime, store.clone());
6633
6634 let request =
6636 RunRequest::new("thread-int", vec![Message::user("a")]).with_agent_id("agent-1");
6637 mailbox.submit_background(request).await.unwrap();
6638
6639 let result = mailbox.interrupt("thread-int").await.unwrap();
6640 assert!(result.new_dispatch_epoch > 0);
6642 }
6643
6644 #[tokio::test]
6645 async fn interrupt_marks_superseded_queued_runs_cancelled() {
6646 crate::metrics::install_recorder();
6647 let store = make_store();
6648 let run_store = Arc::new(InMemoryStore::new());
6649 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6650 let mailbox = Arc::new(Mailbox::new_with_executor(
6651 runtime,
6652 store.clone(),
6653 run_store.clone(),
6654 "test-consumer".to_string(),
6655 MailboxConfig::default(),
6656 ));
6657
6658 let first =
6659 enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-int-runs", "first").await;
6660 let second =
6661 enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-int-runs", "second").await;
6662
6663 let result = mailbox.interrupt_detailed("thread-int-runs").await.unwrap();
6664 assert_eq!(result.superseded_count, 2);
6665 assert_eq!(result.superseded_dispatches.len(), 2);
6666
6667 for submitted in [&first, &second] {
6668 let dispatch = store
6669 .load_dispatch(&submitted.dispatch_id)
6670 .await
6671 .unwrap()
6672 .expect("superseded dispatch should remain inspectable");
6673 assert_eq!(dispatch.status, RunDispatchStatus::Superseded);
6674
6675 let run = run_store
6676 .load_run(&submitted.run_id)
6677 .await
6678 .unwrap()
6679 .expect("superseded run should remain inspectable");
6680 assert_eq!(run.status, RunStatus::Done);
6681 assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6682 assert_eq!(
6683 run.dispatch_id.as_deref(),
6684 Some(submitted.dispatch_id.as_str())
6685 );
6686 }
6687
6688 let output = crate::metrics::render().unwrap_or_default();
6689 assert!(output.contains("operation=\"mark_run_superseded\""));
6690 assert!(output.contains("outcome=\"superseded\""));
6691 }
6692
6693 #[tokio::test]
6694 async fn foreground_submit_marks_prior_queued_run_cancelled() {
6695 let store = make_store();
6696 let run_store = Arc::new(InMemoryStore::new());
6697 let runtime = Arc::new(CountingMailboxRuntime::default());
6698 let mailbox = Arc::new(Mailbox::new_with_executor(
6699 runtime,
6700 store.clone(),
6701 run_store.clone(),
6702 "foreground-consumer".to_string(),
6703 MailboxConfig::default(),
6704 ));
6705
6706 let old =
6707 enqueue_prepared_dispatch(&mailbox, store.as_ref(), "thread-submit-supersede", "old")
6708 .await;
6709
6710 let (_new_result, _events) = mailbox
6711 .submit(
6712 RunRequest::new("thread-submit-supersede", vec![Message::user("new")])
6713 .with_agent_id("agent"),
6714 )
6715 .await
6716 .expect("foreground submit should claim replacement dispatch");
6717
6718 let old_dispatch = store
6719 .load_dispatch(&old.dispatch_id)
6720 .await
6721 .unwrap()
6722 .expect("old dispatch should remain inspectable");
6723 assert_eq!(old_dispatch.status, RunDispatchStatus::Superseded);
6724
6725 let old_run = run_store
6726 .load_run(&old.run_id)
6727 .await
6728 .unwrap()
6729 .expect("old run should remain inspectable");
6730 assert_eq!(old_run.status, RunStatus::Done);
6731 assert_eq!(
6732 old_run.termination_reason,
6733 Some(TerminationReason::Cancelled)
6734 );
6735 assert_eq!(
6736 old_run.dispatch_id.as_deref(),
6737 Some(old.dispatch_id.as_str())
6738 );
6739 }
6740
6741 #[tokio::test]
6742 async fn submit_inline_claim_empty_cancels_precreated_run() {
6743 crate::metrics::install_recorder();
6744 let store = Arc::new(SignalMailboxStore::with_empty_claim_dispatch_once());
6745 let run_store = Arc::new(InMemoryStore::new());
6746 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6747 let mailbox = Arc::new(Mailbox::new_with_executor(
6748 runtime,
6749 store.clone(),
6750 run_store.clone(),
6751 "inline-empty-consumer".to_string(),
6752 MailboxConfig::default(),
6753 ));
6754
6755 let error = match mailbox
6756 .submit(
6757 RunRequest::new("thread-inline-empty", vec![Message::user("go")])
6758 .with_agent_id("agent"),
6759 )
6760 .await
6761 {
6762 Ok(_) => panic!("inline submit should fail when claim_dispatch returns empty"),
6763 Err(error) => error,
6764 };
6765 assert!(error.to_string().contains(ACTIVE_RUN_CONFLICT_MESSAGE));
6766
6767 let dispatches = store
6768 .inner
6769 .list_dispatches("thread-inline-empty", None, 10, 0)
6770 .await
6771 .expect("list inline cleanup dispatches");
6772 assert_eq!(dispatches.len(), 1);
6773 let dispatch = &dispatches[0];
6774 assert_eq!(dispatch.status, RunDispatchStatus::Cancelled);
6775
6776 let run = run_store
6777 .load_run(&dispatch.run_id)
6778 .await
6779 .unwrap()
6780 .expect("inline cleanup should keep run inspectable");
6781 assert_eq!(run.status, RunStatus::Done);
6782 assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6783 assert_eq!(
6784 run.dispatch_id.as_deref(),
6785 Some(dispatch.dispatch_id.as_str())
6786 );
6787
6788 let output = crate::metrics::render().unwrap_or_default();
6789 assert!(output.contains("operation=\"mark_run_cancelled\""));
6790 assert!(output.contains("outcome=\"cancelled\""));
6791 }
6792
6793 #[tokio::test]
6794 async fn recover_reconciles_terminal_cancelled_and_superseded_dispatches_after_crash() {
6795 crate::metrics::install_recorder();
6796 let store = make_store();
6797 let run_store = Arc::new(InMemoryStore::new());
6798 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6799 let mailbox = Arc::new(Mailbox::new_with_executor(
6800 runtime,
6801 store.clone(),
6802 run_store.clone(),
6803 "reconcile-consumer".to_string(),
6804 MailboxConfig::default(),
6805 ));
6806
6807 let cancelled = enqueue_prepared_dispatch(
6808 &mailbox,
6809 store.as_ref(),
6810 "thread-reconcile-cancel",
6811 "cancel",
6812 )
6813 .await;
6814 let superseded = enqueue_prepared_dispatch(
6815 &mailbox,
6816 store.as_ref(),
6817 "thread-reconcile-supersede",
6818 "supersede",
6819 )
6820 .await;
6821
6822 store
6823 .cancel(&cancelled.dispatch_id, now_ms())
6824 .await
6825 .expect("direct cancel should terminalize dispatch");
6826 store
6827 .interrupt("thread-reconcile-supersede", now_ms())
6828 .await
6829 .expect("direct interrupt should supersede dispatch");
6830
6831 for submitted in [&cancelled, &superseded] {
6832 let before = run_store
6833 .load_run(&submitted.run_id)
6834 .await
6835 .unwrap()
6836 .expect("prepared run should exist before reconciliation");
6837 assert_eq!(before.status, RunStatus::Created);
6838 assert!(before.dispatch_id.is_none());
6839 }
6840
6841 let recovered = mailbox.recover().await.expect("recover should reconcile");
6842 assert_eq!(recovered, 0);
6843
6844 for submitted in [&cancelled, &superseded] {
6845 let run = run_store
6846 .load_run(&submitted.run_id)
6847 .await
6848 .unwrap()
6849 .expect("reconciled run should remain inspectable");
6850 assert_eq!(run.status, RunStatus::Done);
6851 assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
6852 assert_eq!(
6853 run.dispatch_id.as_deref(),
6854 Some(submitted.dispatch_id.as_str())
6855 );
6856 }
6857
6858 let output = crate::metrics::render().unwrap_or_default();
6859 assert!(output.contains("operation=\"list_terminal_dispatches\""));
6860 assert!(output.contains("operation=\"reconcile_terminal_dispatch\""));
6861 }
6862
6863 #[tokio::test]
6864 async fn reclaim_dead_letter_marks_run_error() {
6865 crate::metrics::install_recorder();
6866 let store = make_store();
6867 let run_store = Arc::new(InMemoryStore::new());
6868 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6869 let mailbox = Arc::new(Mailbox::new_with_executor(
6870 runtime,
6871 store.clone(),
6872 run_store.clone(),
6873 "test-consumer".to_string(),
6874 MailboxConfig::default(),
6875 ));
6876
6877 let mut dispatch = prepare_queued_dispatch(&mailbox, "thread-reclaim-dead", "expire").await;
6878 dispatch.max_attempts = 1;
6879 dispatch.available_at = 1000;
6880 let dispatch_id = dispatch.dispatch_id.clone();
6881 let run_id = dispatch.run_id.clone();
6882 store.enqueue(&dispatch).await.expect("enqueue dispatch");
6883 let claimed = store
6884 .claim("thread-reclaim-dead", "stale-consumer", 100, 1000, 1)
6885 .await
6886 .expect("claim dispatch");
6887 assert_eq!(claimed.len(), 1);
6888
6889 mailbox
6890 .recover()
6891 .await
6892 .expect("recover should reclaim expired lease");
6893
6894 let dead_letter = store
6895 .load_dispatch(&dispatch_id)
6896 .await
6897 .unwrap()
6898 .expect("dead-lettered dispatch should remain inspectable");
6899 assert_eq!(dead_letter.status, RunDispatchStatus::DeadLetter);
6900
6901 let run = run_store
6902 .load_run(&run_id)
6903 .await
6904 .unwrap()
6905 .expect("dead-lettered run should remain inspectable");
6906 assert_eq!(run.status, RunStatus::Done);
6907 assert!(
6908 matches!(run.termination_reason, Some(TerminationReason::Error(_))),
6909 "dead-lettered dispatch should mark the run as errored"
6910 );
6911 assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
6912
6913 let output = crate::metrics::render().unwrap_or_default();
6914 assert!(output.contains("operation=\"mark_run_dead_letter\""));
6915 assert!(output.contains("outcome=\"dead_letter\""));
6916 }
6917
6918 #[tokio::test]
6919 async fn sweep_reconciles_dead_letter_dispatch_after_crash() {
6920 let store = make_store();
6921 let run_store = Arc::new(InMemoryStore::new());
6922 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
6923 let mailbox = Arc::new(Mailbox::new_with_executor(
6924 runtime,
6925 store.clone(),
6926 run_store.clone(),
6927 "sweep-reconcile-consumer".to_string(),
6928 MailboxConfig::default(),
6929 ));
6930
6931 let mut dispatch =
6932 prepare_queued_dispatch(&mailbox, "thread-sweep-reconcile-dead", "dead").await;
6933 dispatch.available_at = 1;
6934 let dispatch_id = dispatch.dispatch_id.clone();
6935 let run_id = dispatch.run_id.clone();
6936 store.enqueue(&dispatch).await.expect("enqueue dispatch");
6937 let claimed = store
6938 .claim(
6939 "thread-sweep-reconcile-dead",
6940 "stale-consumer",
6941 100,
6942 now_ms(),
6943 1,
6944 )
6945 .await
6946 .expect("claim dispatch");
6947 let claim_token = claimed[0]
6948 .claim_token
6949 .as_deref()
6950 .expect("claimed dispatch should have a claim token")
6951 .to_string();
6952 store
6953 .dead_letter(
6954 &dispatch_id,
6955 &claim_token,
6956 "crashed after dead_letter",
6957 now_ms(),
6958 )
6959 .await
6960 .expect("direct dead_letter should terminalize dispatch");
6961
6962 let before = run_store
6963 .load_run(&run_id)
6964 .await
6965 .unwrap()
6966 .expect("prepared run should exist before sweep reconciliation");
6967 assert_eq!(before.status, RunStatus::Created);
6968
6969 mailbox.run_sweep().await;
6970
6971 let run = run_store
6972 .load_run(&run_id)
6973 .await
6974 .unwrap()
6975 .expect("reconciled run should remain inspectable");
6976 assert_eq!(run.status, RunStatus::Done);
6977 assert!(
6978 matches!(run.termination_reason, Some(TerminationReason::Error(_))),
6979 "dead-lettered dispatch should reconcile the run as errored"
6980 );
6981 assert_eq!(run.dispatch_id.as_deref(), Some(dispatch_id.as_str()));
6982 }
6983
6984 #[tokio::test]
6987 async fn submit_returns_event_channel() {
6988 let store = make_store();
6989 let runtime = make_runtime();
6990 let mailbox = make_mailbox(runtime, store.clone());
6991
6992 let request =
6993 RunRequest::new("thread-stream", vec![Message::user("hi")]).with_agent_id("agent-1");
6994 let (result, _event_rx) = mailbox.submit(request).await.unwrap();
6995
6996 assert_eq!(result.thread_id, "thread-stream");
6997 assert!(!result.dispatch_id.is_empty());
6998 assert!(matches!(
6999 result.status,
7000 MailboxDispatchStatus::Running | MailboxDispatchStatus::Queued
7001 ));
7002 }
7003
7004 #[tokio::test]
7005 async fn live_then_queue_steers_active_run_without_new_dispatch() {
7006 let store = Arc::new(InMemoryStore::new());
7007 let mailbox_store = make_store();
7008 let requests = Arc::new(StdMutex::new(Vec::new()));
7009 let (started_tx, started_rx) = tokio::sync::oneshot::channel();
7010 let (release_tx, release_rx) = tokio::sync::oneshot::channel();
7011 let llm = Arc::new(RecordingLlm::new(
7012 vec![
7013 StreamResult {
7014 content: vec![ContentBlock::text("start tool")],
7015 tool_calls: vec![ToolCall::new("block-1", "block", json!({}))],
7016 usage: None,
7017 stop_reason: Some(StopReason::ToolUse),
7018 has_incomplete_tool_calls: false,
7019 },
7020 StreamResult {
7021 content: vec![ContentBlock::text("saw live input")],
7022 tool_calls: vec![],
7023 usage: None,
7024 stop_reason: Some(StopReason::EndTurn),
7025 has_incomplete_tool_calls: false,
7026 },
7027 ],
7028 requests.clone(),
7029 ));
7030 let resolver = Arc::new(FixedResolver {
7031 agent: ResolvedAgent::new("agent", "m", "sys", llm)
7032 .with_tool(Arc::new(BlockingTool::new(started_tx, release_rx))),
7033 plugins: vec![],
7034 });
7035 let runtime = Arc::new(
7036 AgentRuntime::new(resolver)
7037 .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>)
7038 .with_mailbox_store(mailbox_store.clone()),
7039 );
7040 let mailbox = make_mailbox_with_run_store(
7041 runtime,
7042 mailbox_store.clone(),
7043 store.clone() as Arc<dyn ThreadRunStore>,
7044 );
7045
7046 let first = mailbox
7047 .submit_background(
7048 RunRequest::new("thread-live-steer", vec![Message::user("start")])
7049 .with_agent_id("agent"),
7050 )
7051 .await
7052 .expect("initial submit should start");
7053
7054 tokio::time::timeout(Duration::from_secs(1), started_rx)
7055 .await
7056 .expect("tool should start")
7057 .expect("started signal should send");
7058
7059 let steered = mailbox
7060 .submit_live_then_queue(
7061 RunRequest::new("thread-live-steer", vec![Message::user("live steer")])
7062 .with_agent_id("agent"),
7063 None,
7064 )
7065 .await
7066 .expect("live steer should be accepted");
7067 assert_eq!(steered.status, MailboxDispatchStatus::Running);
7068 assert_eq!(steered.run_id, first.run_id);
7069 assert_eq!(steered.dispatch_id, first.dispatch_id);
7070
7071 let _ = release_tx.send(());
7072 let latest = wait_for_latest_run(&store, "thread-live-steer", |run| {
7073 run.status == RunStatus::Done
7074 })
7075 .await;
7076 assert_eq!(latest.run_id, first.run_id);
7077
7078 let live_message_seen = {
7079 let recorded = requests.lock().expect("lock poisoned");
7080 assert_eq!(recorded.len(), 2);
7081 recorded[1].messages.iter().any(|message| {
7082 message.text() == "live steer"
7083 && message.role == awaken_contract::contract::message::Role::User
7084 && message.visibility == awaken_contract::contract::message::Visibility::All
7085 })
7086 };
7087 assert!(
7088 live_message_seen,
7089 "second LLM turn should receive the live user message"
7090 );
7091
7092 let messages = store
7093 .load_messages("thread-live-steer")
7094 .await
7095 .expect("load messages")
7096 .expect("messages should be persisted");
7097 assert_eq!(
7098 messages
7099 .iter()
7100 .filter(|message| message.text() == "live steer")
7101 .count(),
7102 1,
7103 "live message should be persisted exactly once"
7104 );
7105
7106 let dispatches = mailbox_store
7107 .list_dispatches("thread-live-steer", None, 10, 0)
7108 .await
7109 .expect("list dispatches");
7110 assert_eq!(
7111 dispatches
7112 .iter()
7113 .filter(|dispatch| dispatch.run_id == first.run_id)
7114 .count(),
7115 1,
7116 "live steering must not create an extra dispatch"
7117 );
7118 }
7119
7120 #[tokio::test]
7121 async fn live_then_queue_falls_back_to_durable_dispatch_when_receiver_unavailable() {
7122 let mailbox_store = make_store();
7123 let thread_store = Arc::new(InMemoryStore::new());
7124 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7125 let mailbox = Arc::new(Mailbox::new_with_executor(
7126 runtime,
7127 mailbox_store.clone(),
7128 thread_store.clone(),
7129 "test-consumer".to_string(),
7130 MailboxConfig::default(),
7131 ));
7132 let thread_id = "thread-live-fallback";
7133 let mut active_request =
7134 RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7135 let (_, active_messages) = validate_run_inputs(
7136 active_request.thread_id.clone(),
7137 active_request.messages.clone(),
7138 false,
7139 )
7140 .expect("active input should validate");
7141 mailbox
7142 .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7143 .await
7144 .expect("prepare active run");
7145 let active_dispatch = mailbox
7146 .build_dispatch(&active_request, thread_id)
7147 .expect("build active dispatch");
7148 let active_dispatch_id = active_dispatch.dispatch_id.clone();
7149 mailbox_store
7150 .enqueue(&active_dispatch)
7151 .await
7152 .expect("enqueue active dispatch");
7153 mailbox_store
7154 .claim_dispatch(&active_dispatch_id, "test-consumer", 30_000, now_ms())
7155 .await
7156 .expect("claim active dispatch")
7157 .expect("active dispatch should be claimed");
7158 let worker = mailbox.get_or_create_worker(thread_id).await;
7159 {
7160 let mut worker = worker.lock();
7161 worker.status = MailboxWorkerStatus::Running {
7162 dispatch_id: active_dispatch_id.clone(),
7163 run_id: active_dispatch.run_id.clone(),
7164 lease_handle: tokio::spawn(async {}),
7165 sink: Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0)),
7166 };
7167 }
7168
7169 let result = mailbox
7170 .submit_live_then_queue(
7171 RunRequest::new(thread_id, vec![Message::user("fallback")]).with_agent_id("agent"),
7172 None,
7173 )
7174 .await
7175 .expect("fallback submit should succeed");
7176
7177 assert_eq!(result.status, MailboxDispatchStatus::Queued);
7178 assert_ne!(result.dispatch_id, active_dispatch_id);
7179 let messages = thread_store
7180 .load_messages(thread_id)
7181 .await
7182 .expect("load messages")
7183 .expect("messages should exist");
7184 assert_eq!(
7185 messages
7186 .iter()
7187 .filter(|message| message.text() == "fallback")
7188 .count(),
7189 1,
7190 "fallback message should be persisted once"
7191 );
7192 let queued = mailbox_store
7193 .list_dispatches(thread_id, Some(&[RunDispatchStatus::Queued]), 10, 0)
7194 .await
7195 .expect("list queued");
7196 assert_eq!(queued.len(), 1);
7197 assert_eq!(queued[0].dispatch_id, result.dispatch_id);
7198 }
7199
7200 #[tokio::test]
7201 async fn foreground_submit_sends_live_cancel_for_remote_active_dispatch() {
7202 use awaken_contract::contract::mailbox::LiveRunCommand;
7203 use futures::StreamExt;
7204
7205 let mailbox_store = make_store();
7206 let thread_store = Arc::new(InMemoryStore::new());
7207 let runtime = Arc::new(RecordingMailboxRuntime::default());
7208 let mailbox = Arc::new(Mailbox::new_with_executor(
7209 runtime,
7210 mailbox_store.clone(),
7211 thread_store.clone(),
7212 "foreground-consumer".to_string(),
7213 MailboxConfig::default(),
7214 ));
7215 let thread_id = "thread-remote-foreground";
7216
7217 let mut active_request =
7218 RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7219 let (_, active_messages) = validate_run_inputs(
7220 active_request.thread_id.clone(),
7221 active_request.messages.clone(),
7222 false,
7223 )
7224 .expect("active input should validate");
7225 mailbox
7226 .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7227 .await
7228 .expect("prepare active run");
7229 let active_dispatch = mailbox
7230 .build_dispatch(&active_request, thread_id)
7231 .expect("build active dispatch");
7232 let active_dispatch_id = active_dispatch.dispatch_id.clone();
7233 mailbox_store
7234 .enqueue(&active_dispatch)
7235 .await
7236 .expect("enqueue active dispatch");
7237 let claimed = mailbox_store
7238 .claim_dispatch(&active_dispatch_id, "remote-consumer", 30_000, now_ms())
7239 .await
7240 .expect("claim active dispatch")
7241 .expect("active dispatch should be claimed");
7242 let active_claim_token = claimed.claim_token.clone().expect("claim token");
7243
7244 let subscriber = mailbox_store
7245 .open_live_channel_for(&live_target_for_dispatch(&active_dispatch))
7246 .await
7247 .expect("open live channel");
7248 let captured = Arc::new(tokio::sync::Mutex::new(Vec::<LiveRunCommand>::new()));
7249 let captured_clone = captured.clone();
7250 let store_clone = mailbox_store.clone();
7251 let active_dispatch_id_clone = active_dispatch_id.clone();
7252 let active_claim_token_clone = active_claim_token.clone();
7253 let _forwarder = tokio::spawn(async move {
7254 let mut subscriber = subscriber;
7255 while let Some(entry) = subscriber.next().await {
7256 captured_clone.lock().await.push(entry.command.clone());
7257 if matches!(entry.command, LiveRunCommand::Cancel) {
7258 let release_result = store_clone
7259 .ack(
7260 &active_dispatch_id_clone,
7261 &active_claim_token_clone,
7262 now_ms(),
7263 )
7264 .await;
7265 if let Err(error) = release_result {
7266 assert!(
7267 matches!(error, StorageError::VersionConflict { .. }),
7268 "remote run release should either ack or be superseded, got {error:?}"
7269 );
7270 }
7271 entry.receipt.ack();
7272 break;
7273 }
7274 drop(entry.receipt);
7275 }
7276 });
7277
7278 let (submitted, _events) = mailbox
7279 .submit(
7280 RunRequest::new(thread_id, vec![Message::user("replacement")])
7281 .with_agent_id("agent"),
7282 )
7283 .await
7284 .expect("foreground submit should cancel remote active run and claim replacement");
7285
7286 assert_eq!(submitted.status, MailboxDispatchStatus::Running);
7287 assert_ne!(submitted.dispatch_id, active_dispatch_id);
7288 let commands = captured.lock().await;
7289 assert!(
7290 commands
7291 .iter()
7292 .any(|command| matches!(command, LiveRunCommand::Cancel)),
7293 "foreground submit must deliver live Cancel to the remote active run"
7294 );
7295 }
7296
7297 #[tokio::test]
7298 async fn foreground_submit_does_not_prepare_replacement_when_remote_cancel_times_out() {
7299 use awaken_contract::contract::mailbox::LiveRunCommand;
7300 use futures::StreamExt;
7301
7302 let mailbox_store = make_store();
7303 let thread_store = Arc::new(InMemoryStore::new());
7304 let runtime = Arc::new(RecordingMailboxRuntime::default());
7305 let mailbox = Arc::new(Mailbox::new_with_executor(
7306 runtime,
7307 mailbox_store.clone(),
7308 thread_store.clone(),
7309 "foreground-consumer".to_string(),
7310 MailboxConfig::default(),
7311 ));
7312 let thread_id = "thread-remote-cancel-timeout";
7313
7314 let mut active_request =
7315 RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7316 let (_, active_messages) = validate_run_inputs(
7317 active_request.thread_id.clone(),
7318 active_request.messages.clone(),
7319 false,
7320 )
7321 .expect("active input should validate");
7322 mailbox
7323 .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7324 .await
7325 .expect("prepare active run");
7326 let active_dispatch = mailbox
7327 .build_dispatch(&active_request, thread_id)
7328 .expect("build active dispatch");
7329 let active_dispatch_id = active_dispatch.dispatch_id.clone();
7330 mailbox_store.enqueue(&active_dispatch).await.unwrap();
7331 mailbox_store
7332 .claim_dispatch(&active_dispatch_id, "remote-consumer", 30_000, now_ms())
7333 .await
7334 .unwrap()
7335 .expect("active dispatch should be claimed");
7336
7337 let subscriber = mailbox_store
7338 .open_live_channel_for(&live_target_for_dispatch(&active_dispatch))
7339 .await
7340 .expect("open live channel");
7341 let _forwarder = tokio::spawn(async move {
7342 let mut subscriber = subscriber;
7343 while let Some(entry) = subscriber.next().await {
7344 if matches!(entry.command, LiveRunCommand::Cancel) {
7345 entry.receipt.ack();
7347 break;
7348 }
7349 drop(entry.receipt);
7350 }
7351 });
7352
7353 let result = mailbox
7354 .submit(
7355 RunRequest::new(thread_id, vec![Message::user("replacement")])
7356 .with_agent_id("agent"),
7357 )
7358 .await;
7359 assert!(
7360 matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7361 "foreground submit must fail before writing replacement state when old claim remains active"
7362 );
7363
7364 let messages = thread_store
7365 .load_messages(thread_id)
7366 .await
7367 .expect("load messages")
7368 .expect("active messages should remain");
7369 assert_eq!(messages.len(), 1);
7370 assert_eq!(messages[0].text(), "active");
7371
7372 let all = mailbox_store
7373 .list_dispatches(thread_id, None, 10, 0)
7374 .await
7375 .expect("list dispatches");
7376 assert_eq!(all.len(), 1);
7377 assert_eq!(all[0].dispatch_id, active_dispatch_id);
7378 assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7379 }
7380
7381 #[tokio::test]
7382 async fn foreground_submit_does_not_prepare_replacement_when_local_cancel_times_out() {
7383 let mailbox_store = make_store();
7384 let thread_store = Arc::new(InMemoryStore::new());
7385 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7386 let mailbox = Arc::new(Mailbox::new_with_executor(
7387 runtime,
7388 mailbox_store.clone(),
7389 thread_store.clone(),
7390 "foreground-consumer".to_string(),
7391 MailboxConfig::default(),
7392 ));
7393 let thread_id = "thread-local-cancel-timeout";
7394
7395 let mut active_request =
7396 RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7397 let (_, active_messages) = validate_run_inputs(
7398 active_request.thread_id.clone(),
7399 active_request.messages.clone(),
7400 false,
7401 )
7402 .expect("active input should validate");
7403 mailbox
7404 .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7405 .await
7406 .expect("prepare active run");
7407 let active_dispatch = mailbox
7408 .build_dispatch(&active_request, thread_id)
7409 .expect("build active dispatch");
7410 let active_dispatch_id = active_dispatch.dispatch_id.clone();
7411 mailbox_store.enqueue(&active_dispatch).await.unwrap();
7412 mailbox_store
7413 .claim_dispatch(&active_dispatch_id, "foreground-consumer", 30_000, now_ms())
7414 .await
7415 .unwrap()
7416 .expect("active dispatch should be claimed");
7417
7418 let result = mailbox
7419 .submit(
7420 RunRequest::new(thread_id, vec![Message::user("replacement")])
7421 .with_agent_id("agent"),
7422 )
7423 .await;
7424 assert!(
7425 matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7426 "foreground submit must fail before writing replacement state when local cancel does not release"
7427 );
7428
7429 let messages = thread_store
7430 .load_messages(thread_id)
7431 .await
7432 .expect("load messages")
7433 .expect("active messages should remain");
7434 assert_eq!(messages.len(), 1);
7435 assert_eq!(messages[0].text(), "active");
7436
7437 let all = mailbox_store
7438 .list_dispatches(thread_id, None, 10, 0)
7439 .await
7440 .expect("list dispatches");
7441 assert_eq!(all.len(), 1);
7442 assert_eq!(all[0].dispatch_id, active_dispatch_id);
7443 assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7444 }
7445
7446 #[tokio::test]
7447 async fn foreground_submit_waits_for_local_cancelled_dispatch_to_release_claim() {
7448 let mailbox_store = make_store();
7449 let thread_store = Arc::new(InMemoryStore::new());
7450 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(ImmediateLocalCancelRuntime);
7451 let mailbox = Arc::new(Mailbox::new_with_executor(
7452 runtime,
7453 mailbox_store.clone(),
7454 thread_store.clone(),
7455 "foreground-consumer".to_string(),
7456 MailboxConfig::default(),
7457 ));
7458 let thread_id = "thread-local-cancel-claim-window";
7459
7460 let mut active_request =
7461 RunRequest::new(thread_id, vec![Message::user("active")]).with_agent_id("agent");
7462 let (_, active_messages) = validate_run_inputs(
7463 active_request.thread_id.clone(),
7464 active_request.messages.clone(),
7465 false,
7466 )
7467 .expect("active input should validate");
7468 mailbox
7469 .prepare_run_for_dispatch(&mut active_request, thread_id, &active_messages)
7470 .await
7471 .expect("prepare active run");
7472 let active_dispatch = mailbox
7473 .build_dispatch(&active_request, thread_id)
7474 .expect("build active dispatch");
7475 let active_dispatch_id = active_dispatch.dispatch_id.clone();
7476 mailbox_store.enqueue(&active_dispatch).await.unwrap();
7477 mailbox_store
7478 .claim_dispatch(&active_dispatch_id, "foreground-consumer", 30_000, now_ms())
7479 .await
7480 .unwrap()
7481 .expect("active dispatch should be claimed");
7482
7483 let result = mailbox
7484 .submit(
7485 RunRequest::new(thread_id, vec![Message::user("replacement")])
7486 .with_agent_id("agent"),
7487 )
7488 .await;
7489 assert!(
7490 matches!(result, Err(MailboxError::Validation(ref message)) if message == ACTIVE_RUN_CONFLICT_MESSAGE),
7491 "foreground submit must fail before writing replacement state when local runtime slot released but mailbox claim remains"
7492 );
7493
7494 let messages = thread_store
7495 .load_messages(thread_id)
7496 .await
7497 .expect("load messages")
7498 .expect("active messages should remain");
7499 assert_eq!(messages.len(), 1);
7500 assert_eq!(messages[0].text(), "active");
7501
7502 let all = mailbox_store
7503 .list_dispatches(thread_id, None, 10, 0)
7504 .await
7505 .expect("list dispatches");
7506 assert_eq!(all.len(), 1);
7507 assert_eq!(all[0].dispatch_id, active_dispatch_id);
7508 assert_eq!(all[0].status, RunDispatchStatus::Claimed);
7509 }
7510
7511 #[tokio::test]
7516 async fn live_then_queue_publishes_for_remote_active_run() {
7517 use awaken_contract::contract::mailbox::LiveRunCommand;
7518 use futures::StreamExt;
7519
7520 let mailbox_store = make_store();
7521 let thread_store = Arc::new(InMemoryStore::new());
7522 let thread_id = "thread-remote";
7523 let remote_run_id = "run-remote";
7524
7525 let mut run = seeded_waiting_run(remote_run_id, thread_id, "agent");
7527 run.status = RunStatus::Running;
7528 thread_store
7529 .create_run(&run)
7530 .await
7531 .expect("seed remote run");
7532
7533 let subscriber = mailbox_store
7536 .open_live_channel_for(&live_target_for_run(&run))
7537 .await
7538 .expect("open live channel");
7539 let captured = std::sync::Arc::new(tokio::sync::Mutex::new(Vec::<LiveRunCommand>::new()));
7540 let captured_clone = captured.clone();
7541 let _forwarder = tokio::spawn(async move {
7542 let mut subscriber = subscriber;
7543 while let Some(entry) = subscriber.next().await {
7544 captured_clone.lock().await.push(entry.command.clone());
7545 entry.receipt.ack();
7546 }
7547 });
7548
7549 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7550 let mailbox = Arc::new(Mailbox::new_with_executor(
7551 runtime,
7552 mailbox_store.clone(),
7553 thread_store.clone(),
7554 "test-consumer".to_string(),
7555 MailboxConfig::default(),
7556 ));
7557
7558 let result = mailbox
7559 .submit_live_then_queue(
7560 RunRequest::new(thread_id, vec![Message::user("steer-remote")])
7561 .with_agent_id("agent"),
7562 None,
7563 )
7564 .await
7565 .expect("submit should succeed");
7566
7567 assert_eq!(result.status, MailboxDispatchStatus::Running);
7568 assert_eq!(result.run_id, remote_run_id);
7569
7570 let commands = captured.lock().await;
7572 assert_eq!(commands.len(), 1);
7573 match &commands[0] {
7574 LiveRunCommand::Messages(msgs) => assert_eq!(msgs[0].text(), "steer-remote"),
7575 other => panic!("expected Messages, got {other:?}"),
7576 }
7577 drop(commands);
7578
7579 let queued = mailbox_store
7581 .list_dispatches(thread_id, Some(&[RunDispatchStatus::Queued]), 10, 0)
7582 .await
7583 .expect("list queued");
7584 assert!(
7585 queued.is_empty(),
7586 "cross-node live delivery must not create a dispatch"
7587 );
7588 }
7589
7590 #[tokio::test]
7594 async fn live_then_queue_falls_back_when_subscriber_drops_receipt() {
7595 use futures::StreamExt;
7596
7597 let mailbox_store = make_store();
7598 let thread_store = Arc::new(InMemoryStore::new());
7599 let thread_id = "thread-dropped-receipt";
7600
7601 let mut run = seeded_waiting_run("run-dropped", thread_id, "agent");
7602 run.status = RunStatus::Running;
7603 thread_store.create_run(&run).await.expect("seed run");
7604
7605 let subscriber = mailbox_store
7606 .open_live_channel_for(&live_target_for_run(&run))
7607 .await
7608 .expect("open live channel");
7609 let _rogue = tokio::spawn(async move {
7611 let mut subscriber = subscriber;
7612 while let Some(entry) = subscriber.next().await {
7613 drop(entry.receipt);
7614 }
7615 });
7616
7617 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7618 let mailbox = Arc::new(Mailbox::new_with_executor(
7619 runtime,
7620 mailbox_store.clone(),
7621 thread_store.clone(),
7622 "test-consumer".to_string(),
7623 MailboxConfig::default(),
7624 ));
7625
7626 let result = mailbox
7627 .submit_live_then_queue(
7628 RunRequest::new(thread_id, vec![Message::user("hello?")]).with_agent_id("agent"),
7629 None,
7630 )
7631 .await
7632 .expect("submit should succeed via queue fallback");
7633
7634 let dispatches = mailbox_store
7635 .list_dispatches(thread_id, None, 10, 0)
7636 .await
7637 .expect("list dispatches");
7638 assert_eq!(
7639 dispatches.len(),
7640 1,
7641 "unacked receipt must force a durable dispatch"
7642 );
7643 assert_eq!(result.dispatch_id, dispatches[0].dispatch_id);
7644 }
7645
7646 #[tokio::test]
7653 async fn live_then_queue_is_at_least_once_when_ack_lost() {
7654 use futures::StreamExt;
7655
7656 let mailbox_store = make_store();
7657 let thread_store = Arc::new(InMemoryStore::new());
7658 let thread_id = "thread-ack-lost";
7659
7660 let mut run = seeded_waiting_run("run-ack-lost", thread_id, "agent");
7661 run.status = RunStatus::Running;
7662 thread_store.create_run(&run).await.expect("seed run");
7663
7664 let subscriber = mailbox_store
7665 .open_live_channel_for(&live_target_for_run(&run))
7666 .await
7667 .expect("open live channel");
7668 let accepted = std::sync::Arc::new(tokio::sync::Mutex::new(Vec::<String>::new()));
7671 let accepted_c = accepted.clone();
7672 let _consumer = tokio::spawn(async move {
7673 let mut subscriber = subscriber;
7674 while let Some(entry) = subscriber.next().await {
7675 if let awaken_contract::contract::mailbox::LiveRunCommand::Messages(ref msgs) =
7676 entry.command
7677 {
7678 for m in msgs {
7679 accepted_c.lock().await.push(m.text());
7680 }
7681 }
7682 drop(entry.receipt);
7684 }
7685 });
7686
7687 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7688 let mailbox = Arc::new(Mailbox::new_with_executor(
7689 runtime,
7690 mailbox_store.clone(),
7691 thread_store.clone(),
7692 "test-consumer".to_string(),
7693 MailboxConfig::default(),
7694 ));
7695
7696 let result = mailbox
7697 .submit_live_then_queue(
7698 RunRequest::new(thread_id, vec![Message::user("steer-payload")])
7699 .with_agent_id("agent"),
7700 None,
7701 )
7702 .await
7703 .expect("submit should succeed via queue fallback");
7704
7705 let received = accepted.lock().await.clone();
7707 assert_eq!(
7708 received.as_slice(),
7709 &["steer-payload".to_string()],
7710 "forwarder must have observed the live command before dropping receipt"
7711 );
7712
7713 let dispatches = mailbox_store
7717 .list_dispatches(thread_id, None, 10, 0)
7718 .await
7719 .expect("list dispatches");
7720 assert_eq!(dispatches.len(), 1);
7721 assert_eq!(result.dispatch_id, dispatches[0].dispatch_id);
7722 }
7723
7724 #[tokio::test]
7728 async fn live_then_queue_rejects_remote_mismatched_expected_run_id() {
7729 let mailbox_store = make_store();
7730 let thread_store = Arc::new(InMemoryStore::new());
7731 let thread_id = "thread-mismatch";
7732
7733 let mut run = seeded_waiting_run("run-actual", thread_id, "agent");
7734 run.status = RunStatus::Running;
7735 thread_store.create_run(&run).await.expect("seed run");
7736
7737 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7738 let mailbox = Arc::new(Mailbox::new_with_executor(
7739 runtime,
7740 mailbox_store.clone(),
7741 thread_store.clone(),
7742 "test-consumer".to_string(),
7743 MailboxConfig::default(),
7744 ));
7745
7746 let result = mailbox
7747 .submit_live_then_queue(
7748 RunRequest::new(thread_id, vec![Message::user("wrong-run")]).with_agent_id("agent"),
7749 Some("run-stale"),
7750 )
7751 .await
7752 .expect("submit should succeed");
7753
7754 assert_ne!(
7755 result.run_id, "run-actual",
7756 "mismatched expected_run_id must not steer the stale remote run"
7757 );
7758 }
7759
7760 #[tokio::test]
7761 async fn send_decision_live_delivers_to_remote_waiting_run() {
7762 use awaken_contract::contract::mailbox::LiveRunCommand;
7763 use futures::StreamExt;
7764
7765 let mailbox_store = make_store();
7766 let thread_store = Arc::new(InMemoryStore::new());
7767 let thread_id = "thread-remote-decision";
7768 let run = seeded_waiting_run("run-remote-decision", thread_id, "agent");
7769 thread_store.create_run(&run).await.expect("seed run");
7770
7771 let subscriber = mailbox_store
7772 .open_live_channel_for(&live_target_for_run(&run))
7773 .await
7774 .expect("open targeted live channel");
7775 let captured = Arc::new(tokio::sync::Mutex::new(Vec::new()));
7776 let captured_c = captured.clone();
7777 let _forwarder = tokio::spawn(async move {
7778 let mut subscriber = subscriber;
7779 while let Some(entry) = subscriber.next().await {
7780 if let LiveRunCommand::Decision(decisions) = entry.command {
7781 captured_c.lock().await.push(decisions);
7782 entry.receipt.ack();
7783 break;
7784 }
7785 drop(entry.receipt);
7786 }
7787 });
7788
7789 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7790 let mailbox = Arc::new(Mailbox::new_with_executor(
7791 runtime,
7792 mailbox_store,
7793 thread_store,
7794 "test-consumer".to_string(),
7795 MailboxConfig::default(),
7796 ));
7797
7798 let delivered = mailbox
7799 .send_decision_live(thread_id, "tool-1".to_string(), make_resume())
7800 .await
7801 .expect("live decision should not error");
7802 assert!(delivered);
7803 let captured = captured.lock().await;
7804 assert_eq!(captured.len(), 1);
7805 assert_eq!(captured[0][0].0, "tool-1");
7806 }
7807
7808 #[tokio::test]
7812 async fn live_then_queue_falls_back_to_queue_when_no_remote_subscriber() {
7813 let mailbox_store = make_store();
7814 let thread_store = Arc::new(InMemoryStore::new());
7815 let thread_id = "thread-no-subscriber";
7816
7817 let mut run = seeded_waiting_run("run-no-listener", thread_id, "agent");
7820 run.status = RunStatus::Running;
7821 thread_store.create_run(&run).await.expect("seed run");
7822
7823 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
7824 let mailbox = Arc::new(Mailbox::new_with_executor(
7825 runtime,
7826 mailbox_store.clone(),
7827 thread_store.clone(),
7828 "test-consumer".to_string(),
7829 MailboxConfig::default(),
7830 ));
7831
7832 let result = mailbox
7833 .submit_live_then_queue(
7834 RunRequest::new(thread_id, vec![Message::user("hello?")]).with_agent_id("agent"),
7835 None,
7836 )
7837 .await
7838 .expect("submit should succeed via queue fallback");
7839
7840 let all_dispatches = mailbox_store
7845 .list_dispatches(thread_id, None, 10, 0)
7846 .await
7847 .expect("list dispatches");
7848 assert_eq!(
7849 all_dispatches.len(),
7850 1,
7851 "no-subscriber cross-node must fall back to durable queue"
7852 );
7853 assert_eq!(result.dispatch_id, all_dispatches[0].dispatch_id);
7854 }
7855
7856 #[tokio::test]
7857 async fn waiting_thread_is_reactivated_by_incoming_message() {
7858 let store = Arc::new(InMemoryStore::new());
7859 store
7860 .create_run(&seeded_waiting_run(
7861 "run-waiting",
7862 "thread-waiting",
7863 "agent",
7864 ))
7865 .await
7866 .expect("seed waiting run");
7867
7868 let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
7869 content: vec![ContentBlock::text("reactivated")],
7870 tool_calls: vec![],
7871 usage: None,
7872 stop_reason: Some(StopReason::EndTurn),
7873 has_incomplete_tool_calls: false,
7874 }]));
7875 let resolver = Arc::new(FixedResolver {
7876 agent: ResolvedAgent::new("agent", "m", "sys", llm),
7877 plugins: vec![],
7878 });
7879 let runtime = Arc::new(
7880 AgentRuntime::new(resolver)
7881 .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
7882 );
7883 let mailbox_store = make_store();
7884 let mailbox = make_mailbox_with_run_store(
7885 runtime,
7886 mailbox_store,
7887 store.clone() as Arc<dyn ThreadRunStore>,
7888 );
7889
7890 let submitted = mailbox
7891 .submit_background(
7892 RunRequest::new("thread-waiting", vec![Message::user("poke")])
7893 .with_agent_id("agent"),
7894 )
7895 .await
7896 .expect("submit should succeed");
7897 assert_eq!(submitted.run_id, "run-waiting");
7898
7899 let latest = wait_for_latest_run(&store, "thread-waiting", |run| {
7900 run.status == RunStatus::Done && run.updated_at > 1
7901 })
7902 .await;
7903
7904 assert_eq!(
7905 latest.run_id, "run-waiting",
7906 "incoming messages should continue the existing waiting run"
7907 );
7908 assert_eq!(latest.status, RunStatus::Done);
7909 }
7910
7911 #[tokio::test]
7912 async fn structured_user_input_waiting_thread_is_reused_by_incoming_message() {
7913 let store = Arc::new(InMemoryStore::new());
7914 let mut waiting = seeded_waiting_run("run-user-input", "thread-user-input", "agent");
7915 waiting.waiting = Some(RunWaitingState {
7916 reason: WaitingReason::UserInput,
7917 ticket_ids: Vec::new(),
7918 tickets: Vec::new(),
7919 since_dispatch_id: None,
7920 message: Some("waiting for user input".to_string()),
7921 });
7922 store.create_run(&waiting).await.expect("seed waiting run");
7923
7924 let llm = Arc::new(ScriptedLlm::new(vec![StreamResult {
7925 content: vec![ContentBlock::text("continued")],
7926 tool_calls: vec![],
7927 usage: None,
7928 stop_reason: Some(StopReason::EndTurn),
7929 has_incomplete_tool_calls: false,
7930 }]));
7931 let resolver = Arc::new(FixedResolver {
7932 agent: ResolvedAgent::new("agent", "m", "sys", llm),
7933 plugins: vec![],
7934 });
7935 let runtime = Arc::new(
7936 AgentRuntime::new(resolver)
7937 .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
7938 );
7939 let mailbox = Arc::new(Mailbox::new(
7940 runtime,
7941 make_store(),
7942 store.clone(),
7943 "test-consumer".to_string(),
7944 MailboxConfig::default(),
7945 ));
7946
7947 let submitted = mailbox
7948 .submit_background(
7949 RunRequest::new("thread-user-input", vec![Message::user("continue")])
7950 .with_agent_id("agent"),
7951 )
7952 .await
7953 .expect("submit should succeed");
7954
7955 assert_eq!(
7956 submitted.run_id, "run-user-input",
7957 "structured user-input waiting should keep the same user-intent run"
7958 );
7959 }
7960
7961 #[tokio::test]
7962 async fn reusable_waiting_run_prefers_thread_open_run_projection_over_latest_run() {
7963 let store = Arc::new(InMemoryStore::new());
7964 let thread_id = "thread-open-projection";
7965 let mut open = seeded_waiting_run("run-open", thread_id, "agent");
7966 open.waiting = Some(RunWaitingState {
7967 reason: WaitingReason::UserInput,
7968 ticket_ids: Vec::new(),
7969 tickets: Vec::new(),
7970 since_dispatch_id: None,
7971 message: Some("waiting for explicit input".to_string()),
7972 });
7973 open.updated_at = 10;
7974 let mut newer = seeded_waiting_run("run-newer-latest", thread_id, "agent");
7975 newer.updated_at = 20;
7976
7977 store.create_run(&open).await.expect("seed open run");
7978 store.create_run(&newer).await.expect("seed newer run");
7979 let mut thread = Thread::with_id(thread_id);
7980 thread.open_run_id = Some(open.run_id.clone());
7981 store
7982 .save_thread(&thread)
7983 .await
7984 .expect("save thread projection");
7985
7986 let runtime = Arc::new(RecordingStoreMailboxRuntime::new(store.clone()));
7987 let mailbox = Arc::new(Mailbox::new(
7988 runtime.clone(),
7989 make_store(),
7990 store.clone(),
7991 "test-consumer".to_string(),
7992 MailboxConfig::default(),
7993 ));
7994
7995 let submitted = mailbox
7996 .submit_background(
7997 RunRequest::new(thread_id, vec![Message::user("continue open")])
7998 .with_agent_id("agent"),
7999 )
8000 .await
8001 .expect("submit should succeed");
8002
8003 assert_eq!(
8004 submitted.run_id, "run-open",
8005 "thread.open_run_id must win over latest_run() when resuming same user intent"
8006 );
8007 let deadline = Instant::now() + Duration::from_secs(1);
8008 loop {
8009 if !runtime.requests.lock().expect("lock poisoned").is_empty() {
8010 break;
8011 }
8012 assert!(Instant::now() < deadline, "request was not dispatched");
8013 sleep(Duration::from_millis(5)).await;
8014 }
8015 let requests = runtime.requests.lock().expect("lock poisoned");
8016 assert_eq!(requests[0].continue_run_id.as_deref(), Some("run-open"));
8017 }
8018
8019 #[tokio::test]
8020 async fn recover_only_enqueues_orphaned_background_task_waiting_runs() {
8021 let store = Arc::new(InMemoryStore::new());
8022 let mut background = seeded_waiting_run("run-bg", "thread-bg-recover", "agent");
8023 background.waiting = Some(RunWaitingState {
8024 reason: WaitingReason::BackgroundTasks,
8025 ticket_ids: Vec::new(),
8026 tickets: Vec::new(),
8027 since_dispatch_id: None,
8028 message: None,
8029 });
8030 store.create_run(&background).await.expect("seed bg run");
8031
8032 let mut user_input = seeded_waiting_run("run-user", "thread-user-recover", "agent");
8033 user_input.waiting = Some(RunWaitingState {
8034 reason: WaitingReason::UserInput,
8035 ticket_ids: Vec::new(),
8036 tickets: Vec::new(),
8037 since_dispatch_id: None,
8038 message: Some("waiting for user".to_string()),
8039 });
8040 store
8041 .create_run(&user_input)
8042 .await
8043 .expect("seed user-input run");
8044
8045 let mailbox_store = make_store();
8046 let runtime = Arc::new(RecordingStoreMailboxRuntime::new(store.clone()));
8047 let mailbox = Arc::new(Mailbox::new(
8048 runtime.clone(),
8049 mailbox_store.clone(),
8050 store.clone(),
8051 "test-consumer".to_string(),
8052 MailboxConfig::default(),
8053 ));
8054
8055 let recovered = mailbox.recover().await.expect("recover should succeed");
8056 assert_eq!(recovered, 1);
8057
8058 let deadline = Instant::now() + Duration::from_secs(1);
8059 loop {
8060 if runtime.requests.lock().expect("lock poisoned").len() == 1 {
8061 break;
8062 }
8063 assert!(Instant::now() < deadline, "recover did not dispatch wake");
8064 sleep(Duration::from_millis(5)).await;
8065 }
8066
8067 {
8068 let requests = runtime.requests.lock().expect("lock poisoned");
8069 assert_eq!(requests.len(), 1);
8070 assert_eq!(requests[0].thread_id, "thread-bg-recover");
8071 assert_eq!(requests[0].continue_run_id.as_deref(), Some("run-bg"));
8072 assert_eq!(requests[0].run_mode, RunMode::InternalWake);
8073 assert_eq!(requests[0].adapter, AdapterKind::Internal);
8074 }
8075
8076 let user_dispatches = mailbox_store
8077 .list_dispatches("thread-user-recover", None, 10, 0)
8078 .await
8079 .expect("list user dispatches");
8080 assert!(
8081 user_dispatches.is_empty(),
8082 "user-input waiting runs must stay suspended until explicit input"
8083 );
8084 }
8085
8086 #[tokio::test]
8087 async fn background_task_completion_should_enqueue_internal_wake_message() {
8088 let store = Arc::new(InMemoryStore::new());
8089 let mailbox_store = make_store();
8090 let manager = Arc::new(BackgroundTaskManager::new());
8091
8092 let llm = Arc::new(ScriptedLlm::new(vec![
8093 StreamResult {
8094 content: vec![ContentBlock::text("spawning task")],
8095 tool_calls: vec![ToolCall::new("c1", "spawn_bg", json!({}))],
8096 usage: None,
8097 stop_reason: Some(StopReason::ToolUse),
8098 has_incomplete_tool_calls: false,
8099 },
8100 StreamResult {
8101 content: vec![ContentBlock::text("waiting for background task")],
8102 tool_calls: vec![],
8103 usage: None,
8104 stop_reason: Some(StopReason::EndTurn),
8105 has_incomplete_tool_calls: false,
8106 },
8107 ]));
8108 let agent = ResolvedAgent::new("agent", "m", "sys", llm).with_tool(Arc::new(
8109 SpawnShortBgTaskTool {
8110 manager: manager.clone(),
8111 delay: Duration::from_millis(25),
8112 },
8113 ));
8114 let resolver = Arc::new(FixedResolver {
8115 agent,
8116 plugins: vec![Arc::new(BackgroundTaskPlugin::new(manager))],
8117 });
8118 let runtime = Arc::new(
8119 AgentRuntime::new(resolver)
8120 .with_thread_run_store(store.clone() as Arc<dyn ThreadRunStore>),
8121 );
8122 let mailbox = make_mailbox_with_run_store(
8123 runtime,
8124 mailbox_store.clone(),
8125 store.clone() as Arc<dyn ThreadRunStore>,
8126 );
8127
8128 mailbox
8129 .submit_background(
8130 RunRequest::new("thread-bg", vec![Message::user("start")]).with_agent_id("agent"),
8131 )
8132 .await
8133 .expect("submit should succeed");
8134
8135 let waiting =
8136 wait_for_latest_run(&store, "thread-bg", |run| run.is_background_task_waiting()).await;
8137 sleep(Duration::from_millis(100)).await;
8138
8139 let dispatches = mailbox_store
8140 .list_dispatches("thread-bg", None, 10, 0)
8141 .await
8142 .expect("list dispatches should succeed");
8143
8144 assert!(
8145 dispatches.len() >= 2,
8146 "background completion should enqueue an internal wake message; waiting run was {:?}, dispatches were {:?}",
8147 waiting,
8148 dispatches
8149 );
8150 let messages = store
8151 .load_messages("thread-bg")
8152 .await
8153 .expect("load messages")
8154 .unwrap_or_default();
8155 assert!(
8156 messages.iter().any(|msg| {
8157 msg.role == awaken_contract::contract::message::Role::User
8158 && msg.visibility == awaken_contract::contract::message::Visibility::Internal
8159 && msg.text().contains("<background-task-event")
8160 && msg.text().contains("\"done\":true")
8161 }),
8162 "expected a synthetic background wake message after task completion"
8163 );
8164 }
8165
#[test]
/// `send_decision` on a freshly built mailbox, addressed to the id
/// `"nonexistent"`, returns `false`.
fn send_decision_unknown_id_returns_false() {
    let store = make_store();
    let runtime = make_runtime();
    let mailbox = make_mailbox(runtime, store);

    let resume = ToolCallResume {
        decision_id: "d1".into(),
        action: awaken_contract::contract::suspension::ResumeDecisionAction::Resume,
        result: serde_json::json!({"approved": true}),
        reason: None,
        updated_at: 0,
    };

    let result = mailbox.send_decision("nonexistent", "tc-1".to_string(), resume);
    assert!(!result);
}
8187
8188 #[tokio::test]
8191 async fn concurrent_submit_background_same_thread_only_one_runs() {
8192 let store = make_store();
8193 let runtime = make_runtime();
8194 let mailbox = make_mailbox(runtime, store.clone());
8195
8196 let mut handles = Vec::new();
8198 for i in 0..5 {
8199 let mb = Arc::clone(&mailbox);
8200 handles.push(tokio::spawn(async move {
8201 let req = RunRequest::new("thread-conc", vec![Message::user(format!("msg-{i}"))])
8202 .with_agent_id("agent-1");
8203 mb.submit_background(req).await
8204 }));
8205 }
8206 let results: Vec<_> = futures::future::join_all(handles)
8207 .await
8208 .into_iter()
8209 .map(|r| r.unwrap())
8210 .collect();
8211
8212 assert!(results.iter().all(|r| r.is_ok()));
8214
8215 let running_count = results
8217 .iter()
8218 .filter_map(|r| r.as_ref().ok())
8219 .filter(|r| matches!(r.status, MailboxDispatchStatus::Running))
8220 .count();
8221 assert!(
8222 running_count <= 1,
8223 "at most 1 should be Running, got {running_count}"
8224 );
8225
8226 let dispatches = store
8228 .list_dispatches("thread-conc", Some(&[RunDispatchStatus::Claimed]), 10, 0)
8229 .await
8230 .unwrap();
8231 assert!(
8232 dispatches.len() <= 1,
8233 "store should have at most 1 Claimed dispatch, got {}",
8234 dispatches.len()
8235 );
8236 }
8237
8238 #[tokio::test]
8239 async fn concurrent_submit_same_thread_only_one_claims() {
8240 let store = make_store();
8241 let runtime = make_runtime();
8242 let mailbox = make_mailbox(runtime, store.clone());
8243
8244 let mut handles = Vec::new();
8246 for i in 0..3 {
8247 let mb = Arc::clone(&mailbox);
8248 handles.push(tokio::spawn(async move {
8249 let req = RunRequest::new(
8250 "thread-stream-conc",
8251 vec![Message::user(format!("msg-{i}"))],
8252 )
8253 .with_agent_id("agent-1");
8254 mb.submit(req).await
8255 }));
8256 }
8257 let results: Vec<_> = futures::future::join_all(handles)
8258 .await
8259 .into_iter()
8260 .map(|r| r.unwrap())
8261 .collect();
8262
8263 let ok_count = results.iter().filter(|r| r.is_ok()).count();
8265 assert!(ok_count >= 1, "at least 1 should succeed");
8266
8267 let dispatches = store
8269 .list_dispatches(
8270 "thread-stream-conc",
8271 Some(&[RunDispatchStatus::Claimed]),
8272 10,
8273 0,
8274 )
8275 .await
8276 .unwrap();
8277 assert!(
8278 dispatches.len() <= 1,
8279 "at most 1 Claimed, got {}",
8280 dispatches.len()
8281 );
8282 }
8283
8284 #[tokio::test]
8285 async fn interrupt_between_claim_and_execution_supersedes_without_runtime_start() {
8286 crate::metrics::install_recorder();
8287 let store = Arc::new(InterruptOnLoadMailboxStore::new());
8288 let run_store = Arc::new(InMemoryStore::new());
8289 let runtime = Arc::new(CountingMailboxRuntime::default());
8290 let mailbox = Arc::new(Mailbox::new_with_executor(
8291 runtime.clone(),
8292 store.clone(),
8293 run_store.clone(),
8294 "epoch-race-consumer".to_string(),
8295 MailboxConfig {
8296 lease_ms: 100,
8297 lease_renewal_interval: Duration::from_millis(20),
8298 ..MailboxConfig::default()
8299 },
8300 ));
8301
8302 let result = mailbox
8303 .submit_background(
8304 RunRequest::new("thread-epoch-race", vec![Message::user("go")])
8305 .with_agent_id("agent"),
8306 )
8307 .await
8308 .expect("submit should succeed");
8309
8310 tokio::time::timeout(Duration::from_secs(2), async {
8311 loop {
8312 if let Some(dispatch) = store.load_dispatch(&result.dispatch_id).await.unwrap()
8313 && dispatch.status == RunDispatchStatus::Superseded
8314 {
8315 break;
8316 }
8317 sleep(Duration::from_millis(10)).await;
8318 }
8319 })
8320 .await
8321 .expect("dispatch should be superseded promptly");
8322
8323 assert_eq!(
8324 runtime.run_count(),
8325 0,
8326 "stale dispatch must not enter runtime"
8327 );
8328 let loaded = store
8329 .load_dispatch(&result.dispatch_id)
8330 .await
8331 .unwrap()
8332 .expect("dispatch should remain inspectable");
8333 assert_eq!(loaded.status, RunDispatchStatus::Superseded);
8334 assert!(loaded.claim_token.is_none());
8335 assert!(loaded.lease_until.is_none());
8336
8337 let run = run_store
8338 .load_run(&result.run_id)
8339 .await
8340 .unwrap()
8341 .expect("prepared run should remain inspectable");
8342 assert_eq!(run.status, RunStatus::Done);
8343 assert_eq!(run.termination_reason, Some(TerminationReason::Cancelled));
8344 assert_eq!(
8345 run.dispatch_id.as_deref(),
8346 Some(result.dispatch_id.as_str())
8347 );
8348
8349 let output = crate::metrics::render().unwrap_or_default();
8350 assert!(output.contains("operation=\"load_dispatch\""));
8351 assert!(output.contains("operation=\"current_dispatch_epoch\""));
8352 assert!(output.contains("operation=\"supersede_claimed\""));
8353 assert!(output.contains("operation=\"mark_run_superseded\""));
8354 }
8355
8356 #[tokio::test]
8357 async fn dispatch_signal_busy_ack_still_runs_queued_dispatch_after_current_finishes() {
8358 let store = Arc::new(SignalMailboxStore::new());
8359 let run_store = Arc::new(InMemoryStore::new());
8360 let (started_tx, mut started_rx) = tokio::sync::mpsc::unbounded_channel();
8361 let release_first = Arc::new(tokio::sync::Notify::new());
8362 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(BlockingMailboxRuntime::new(
8363 started_tx,
8364 Arc::clone(&release_first),
8365 ));
8366 let mailbox = Arc::new(Mailbox::new_with_executor(
8367 runtime,
8368 store.clone(),
8369 run_store,
8370 "signal-consumer".to_string(),
8371 MailboxConfig::default(),
8372 ));
8373
8374 let mut first = RunRequest::new("thread-signal-busy", vec![Message::user("first")])
8375 .with_agent_id("agent");
8376 let (thread_id, first_messages) =
8377 validate_run_inputs(first.thread_id.clone(), first.messages.clone(), false)
8378 .expect("first input should validate");
8379 mailbox
8380 .prepare_run_for_dispatch(&mut first, &thread_id, &first_messages)
8381 .await
8382 .expect("prepare first run");
8383 let first_dispatch = mailbox
8384 .build_dispatch(&first, &thread_id)
8385 .expect("build first dispatch");
8386 let first_dispatch_id = first_dispatch.dispatch_id.clone();
8387 store.enqueue(&first_dispatch).await.expect("enqueue first");
8388
8389 let mut second = RunRequest::new("thread-signal-busy", vec![Message::user("second")])
8390 .with_agent_id("agent");
8391 let (_, second_messages) =
8392 validate_run_inputs(second.thread_id.clone(), second.messages.clone(), false)
8393 .expect("second input should validate");
8394 mailbox
8395 .prepare_run_for_dispatch(&mut second, &thread_id, &second_messages)
8396 .await
8397 .expect("prepare second run");
8398 let second_dispatch = mailbox
8399 .build_dispatch(&second, &thread_id)
8400 .expect("build second dispatch");
8401 let second_dispatch_id = second_dispatch.dispatch_id.clone();
8402 store
8403 .enqueue(&second_dispatch)
8404 .await
8405 .expect("enqueue second");
8406
8407 let signal_loop = tokio::spawn(Arc::clone(&mailbox).run_dispatch_signal_loop());
8408 let (ordinal, dispatch_id) =
8409 tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
8410 .await
8411 .expect("first dispatch should start")
8412 .expect("runtime should report first start");
8413 assert_eq!(ordinal, 1);
8414 let blocked_dispatch_id = dispatch_id.expect("started dispatch should have an id");
8415 assert!(
8416 blocked_dispatch_id == first_dispatch_id || blocked_dispatch_id == second_dispatch_id,
8417 "started dispatch must be one of the two queued dispatches"
8418 );
8419 let queued_dispatch_id = if blocked_dispatch_id == first_dispatch_id {
8420 second_dispatch_id.as_str()
8421 } else {
8422 first_dispatch_id.as_str()
8423 };
8424
8425 let deadline = Instant::now() + Duration::from_secs(2);
8426 while store.acked_signal_count() < 2 {
8427 assert!(
8428 Instant::now() < deadline,
8429 "signal loop must ack the busy second signal instead of blocking"
8430 );
8431 sleep(Duration::from_millis(10)).await;
8432 }
8433 assert_eq!(store.nacked_signal_count(), 0);
8434 let queued_before_release = store
8435 .load_dispatch(queued_dispatch_id)
8436 .await
8437 .expect("load queued dispatch")
8438 .expect("queued dispatch exists");
8439 assert_eq!(
8440 queued_before_release.status,
8441 RunDispatchStatus::Queued,
8442 "busy signal ack must not claim the other dispatch before the first finishes"
8443 );
8444
8445 release_first.notify_waiters();
8446 let (ordinal, dispatch_id) =
8447 tokio::time::timeout(Duration::from_secs(2), started_rx.recv())
8448 .await
8449 .expect("queued dispatch should start after first finishes")
8450 .expect("runtime should report second start");
8451 assert_eq!(ordinal, 2);
8452 assert_eq!(dispatch_id.as_deref(), Some(queued_dispatch_id));
8453
8454 let first_done = wait_for_dispatch(&store.inner, &first_dispatch_id, |dispatch| {
8455 dispatch.status == RunDispatchStatus::Acked
8456 })
8457 .await;
8458 let second_done = wait_for_dispatch(&store.inner, &second_dispatch_id, |dispatch| {
8459 dispatch.status == RunDispatchStatus::Acked
8460 })
8461 .await;
8462 signal_loop.abort();
8463
8464 assert_eq!(first_done.status, RunDispatchStatus::Acked);
8465 assert_eq!(second_done.status, RunDispatchStatus::Acked);
8466 assert_eq!(store.acked_signal_count(), 2);
8467 assert_eq!(store.nacked_signal_count(), 0);
8468 }
8469
8470 #[tokio::test]
8471 async fn submit_background_returns_correct_status() {
8472 let store = make_store();
8473 let runtime = make_runtime();
8474 let mailbox = make_mailbox(runtime, store.clone());
8475
8476 let req1 =
8478 RunRequest::new("thread-status", vec![Message::user("a")]).with_agent_id("agent-1");
8479 let result1 = mailbox.submit_background(req1).await.unwrap();
8480 assert!(
8482 matches!(
8483 result1.status,
8484 MailboxDispatchStatus::Running | MailboxDispatchStatus::Queued
8485 ),
8486 "first dispatch should be Running or Queued"
8487 );
8488
8489 let req2 =
8491 RunRequest::new("thread-status", vec![Message::user("b")]).with_agent_id("agent-1");
8492 let result2 = mailbox.submit_background(req2).await.unwrap();
8493 assert!(
8494 matches!(result2.status, MailboxDispatchStatus::Queued),
8495 "second dispatch should be Queued while first is running"
8496 );
8497 }
8498
8499 #[tokio::test]
8500 async fn worker_status_not_corrupted_after_empty_claim() {
8501 let store = make_store();
8502 let runtime = make_runtime();
8503 let mailbox = make_mailbox(runtime, store.clone());
8504
8505 let req =
8507 RunRequest::new("thread-guard", vec![Message::user("a")]).with_agent_id("agent-1");
8508 mailbox.submit_background(req).await.unwrap();
8509
8510 let workers = mailbox.workers.read().await;
8512 if let Some(worker) = workers.get("thread-guard") {
8513 let w = worker.lock();
8514 assert!(
8515 !matches!(w.status, MailboxWorkerStatus::Idle),
8516 "worker should not be Idle after dispatch"
8517 );
8518 }
8519 drop(workers);
8520
8521 mailbox.try_dispatch_next("thread-guard").await;
8523
8524 let workers = mailbox.workers.read().await;
8526 if let Some(worker) = workers.get("thread-guard") {
8527 let w = worker.lock();
8528 assert!(
8529 !matches!(w.status, MailboxWorkerStatus::Idle),
8530 "worker should still not be Idle"
8531 );
8532 }
8533 }
8534
#[test]
/// `RunRequestExtras::from_value` must return an error when `overrides` is a
/// string and `decisions` is a number.
fn run_request_extras_corrupt_json_returns_error() {
    let corrupt = serde_json::json!({"overrides": "not-an-object", "decisions": 42});
    let parsed = RunRequestExtras::from_value(&corrupt);
    assert!(parsed.is_err(), "corrupt JSON should fail deserialization");
}
8543
8544 #[tokio::test]
8545 async fn submit_inline_claim_fails_when_thread_already_claimed() {
8546 let store = make_store();
8547 let runtime = make_runtime();
8548 let mailbox = make_mailbox(runtime, store.clone());
8549
8550 let req1 =
8552 RunRequest::new("thread-clash", vec![Message::user("first")]).with_agent_id("agent-1");
8553 let result1 = mailbox.submit(req1).await;
8554 assert!(result1.is_ok(), "first submit should succeed");
8555
8556 let req2 =
8559 RunRequest::new("thread-clash", vec![Message::user("second")]).with_agent_id("agent-1");
8560 let result2 = mailbox.submit(req2).await;
8561 match &result2 {
8564 Ok((r, _)) => assert!(!r.dispatch_id.is_empty()),
8565 Err(MailboxError::Validation(_)) => {} Err(e) => panic!("unexpected error: {e}"),
8567 }
8568
8569 let claimed = store
8571 .list_dispatches("thread-clash", Some(&[RunDispatchStatus::Claimed]), 10, 0)
8572 .await
8573 .unwrap();
8574 assert!(
8575 claimed.len() <= 1,
8576 "at most 1 Claimed, got {}",
8577 claimed.len()
8578 );
8579 }
8580
8581 #[tokio::test]
8582 async fn reconnect_sink_returns_false_for_idle_worker() {
8583 let store = make_store();
8584 let runtime = make_runtime();
8585 let mailbox = make_mailbox(runtime, store);
8586
8587 mailbox.get_or_create_worker("thread-idle").await;
8589
8590 let (tx, _rx) = tokio::sync::mpsc::channel(16);
8591 let result = mailbox.reconnect_sink("thread-idle", tx).await;
8592 assert!(!result, "reconnect should fail for idle worker");
8593 }
8594
8595 #[tokio::test]
8596 async fn reconnect_sink_returns_false_for_unknown_thread() {
8597 let store = make_store();
8598 let runtime = make_runtime();
8599 let mailbox = make_mailbox(runtime, store);
8600
8601 let (tx, _rx) = tokio::sync::mpsc::channel(16);
8602 let result = mailbox.reconnect_sink("nonexistent", tx).await;
8603 assert!(!result, "reconnect should fail for unknown thread");
8604 }
8605
8606 #[tokio::test]
8607 async fn reconnect_sink_succeeds_for_running_worker() {
8608 let store = make_store();
8609 let runtime = make_runtime();
8610 let mailbox = make_mailbox(runtime, store);
8611
8612 let worker = mailbox.get_or_create_worker("thread-reconnect").await;
8615 {
8616 let reconnectable = Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0));
8617 let mut w = worker.lock();
8618 w.status = MailboxWorkerStatus::Running {
8619 dispatch_id: "dispatch-fake".into(),
8620 run_id: "run-fake".into(),
8621 lease_handle: tokio::spawn(futures::future::pending::<()>()),
8622 sink: reconnectable,
8623 };
8624 }
8625
8626 let (tx, _rx) = mpsc::channel(16);
8627 let result = mailbox.reconnect_sink("thread-reconnect", tx).await;
8628 assert!(result, "reconnect should succeed for running worker");
8629 }
8630
8631 #[tokio::test]
8632 async fn build_dispatch_extras_roundtrip_with_decisions() {
8633 use awaken_contract::contract::suspension::{ResumeDecisionAction, ToolCallResume};
8634
8635 let decisions = vec![(
8636 "call-1".to_string(),
8637 ToolCallResume {
8638 decision_id: "d-1".into(),
8639 action: ResumeDecisionAction::Resume,
8640 result: serde_json::json!({"approved": true}),
8641 reason: None,
8642 updated_at: 0,
8643 },
8644 )];
8645
8646 let request = RunRequest::new("thread-dec", vec![Message::user("hi")])
8647 .with_agent_id("a1")
8648 .with_decisions(decisions.clone());
8649 let extras = RunRequestExtras::from_request(&request);
8650 assert_eq!(extras.decisions.len(), 1);
8651 assert_eq!(extras.decisions[0].0, "call-1");
8652 }
8653
8654 #[tokio::test]
8655 async fn prepare_run_origin_a2a_roundtrip() {
8656 let store = make_store();
8657 let runtime = make_runtime();
8658 let thread_store = Arc::new(InMemoryStore::new());
8659 let mailbox = Arc::new(Mailbox::new(
8660 runtime,
8661 store,
8662 thread_store.clone(),
8663 "test-consumer".to_string(),
8664 MailboxConfig::default(),
8665 ));
8666
8667 let mut request = RunRequest::new("thread-a2a", vec![Message::user("hi")])
8668 .with_origin(RunRequestOrigin::A2A)
8669 .with_parent_run_id("parent-123");
8670 let (thread_id, messages) =
8671 validate_run_inputs(request.thread_id.clone(), request.messages.clone(), false)
8672 .unwrap();
8673 let run_id = mailbox
8674 .prepare_run_for_dispatch(&mut request, &thread_id, &messages)
8675 .await
8676 .unwrap();
8677 let run = thread_store.load_run(&run_id).await.unwrap().unwrap();
8678
8679 assert!(matches!(
8680 run.request.as_ref().unwrap().origin,
8681 RunRequestOrigin::A2A
8682 ));
8683 assert_eq!(run.parent_run_id.as_deref(), Some("parent-123"));
8684 }
8685
/// Pins `INLINE_CLAIM_GUARD_MS` to 60 000 so an accidental change is caught.
#[test]
fn inline_claim_guard_is_reasonable() {
    let expected_guard_ms = 60_000;
    assert_eq!(INLINE_CLAIM_GUARD_MS, expected_guard_ms);
}
8692
/// With the default config, the delay computed as `default_retry_delay_ms * 2^(attempt - 1)`
/// (exponent capped at 6, result capped at `max_retry_delay_ms`) must double from
/// 250 ms on attempt 1 up to 16 000 ms on attempt 7.
#[test]
fn nack_backoff_progression() {
    let config = MailboxConfig::default();
    let expectations: [(u32, u64); 7] = [
        (1, 250),
        (2, 500),
        (3, 1000),
        (4, 2000),
        (5, 4000),
        (6, 8000),
        (7, 16000),
    ];

    for (attempt_count, expected_ms) in expectations {
        let exponent = attempt_count.saturating_sub(1).min(6);
        let uncapped = config.default_retry_delay_ms * 2u64.pow(exponent);
        let delay = uncapped.min(config.max_retry_delay_ms);
        assert_eq!(delay, expected_ms, "attempt_count={attempt_count}");
    }
}
8717
/// A base delay of 1000 ms with factor 2^3 (8000 ms) must be clamped to the
/// configured 5000 ms maximum.
#[test]
fn nack_backoff_caps_at_max() {
    let config = MailboxConfig {
        default_retry_delay_ms: 1000,
        max_retry_delay_ms: 5000,
        ..Default::default()
    };

    let uncapped = config.default_retry_delay_ms * 2u64.pow(3);
    let delay = uncapped.min(config.max_retry_delay_ms);
    assert_eq!(delay, 5000);
}
8730
/// An attempt count of zero saturates to exponent 0, so the delay is the default
/// base delay (250 ms is asserted).
#[test]
fn nack_backoff_zero_attempt_is_base_delay() {
    let config = MailboxConfig::default();

    let exponent = 0u32.saturating_sub(1).min(6);
    let uncapped = config.default_retry_delay_ms * 2u64.pow(exponent);
    let delay = uncapped.min(config.max_retry_delay_ms);
    assert_eq!(delay, 250);
}
8740
/// A very large attempt count keeps the exponent at 6: 16 000 ms with the default
/// config, and 10 000 ms when `max_retry_delay_ms` is lowered to that value.
#[test]
fn nack_backoff_high_attempt_stays_capped() {
    let backoff_factor = 2u64.pow(100u32.saturating_sub(1).min(6));

    let default_config = MailboxConfig::default();
    let default_delay = (default_config.default_retry_delay_ms * backoff_factor)
        .min(default_config.max_retry_delay_ms);
    assert_eq!(default_delay, 16000);

    let lowered_max = MailboxConfig {
        max_retry_delay_ms: 10_000,
        ..Default::default()
    };
    let lowered_delay =
        (lowered_max.default_retry_delay_ms * backoff_factor).min(lowered_max.max_retry_delay_ms);
    assert_eq!(lowered_delay, 10_000);
}
8758
8759 #[tokio::test]
8762 async fn gc_idle_workers_removes_idle_with_no_dispatches() {
8763 let store = make_store();
8764 let runtime = make_runtime();
8765 let mailbox = make_mailbox(runtime, store.clone());
8766
8767 {
8769 let mut workers = mailbox.workers.write().await;
8770 workers.insert(
8771 "thread-gc".to_string(),
8772 Arc::new(SyncMutex::new(MailboxWorker::default())),
8773 );
8774 }
8775
8776 assert!(mailbox.workers.read().await.contains_key("thread-gc"));
8778
8779 mailbox.gc_idle_workers().await;
8781
8782 assert!(
8783 !mailbox.workers.read().await.contains_key("thread-gc"),
8784 "idle worker with no queued dispatches should be removed"
8785 );
8786 }
8787
8788 #[tokio::test]
8789 async fn gc_idle_workers_keeps_worker_with_queued_dispatches() {
8790 let store = make_store();
8791 let runtime = make_runtime();
8792 let mailbox = make_mailbox(runtime, store.clone());
8793
8794 let request =
8796 RunRequest::new("thread-gc-keep", vec![Message::user("hi")]).with_agent_id("agent-1");
8797 mailbox.submit_background(request).await.unwrap();
8798
8799 {
8802 let mut workers = mailbox.workers.write().await;
8803 workers.insert(
8804 "thread-gc-keep".to_string(),
8805 Arc::new(SyncMutex::new(MailboxWorker::default())),
8806 );
8807 }
8808
8809 mailbox.gc_idle_workers().await;
8811
8812 let has_dispatches = !store
8814 .list_dispatches(
8815 "thread-gc-keep",
8816 Some(&[RunDispatchStatus::Queued, RunDispatchStatus::Claimed]),
8817 1,
8818 0,
8819 )
8820 .await
8821 .unwrap()
8822 .is_empty();
8823 if has_dispatches {
8824 assert!(
8825 mailbox.workers.read().await.contains_key("thread-gc-keep"),
8826 "idle worker with queued dispatches should NOT be removed"
8827 );
8828 }
8829 }
8830
8831 #[tokio::test]
8832 async fn gc_idle_workers_noop_when_empty() {
8833 let store = make_store();
8834 let runtime = make_runtime();
8835 let mailbox = make_mailbox(runtime, store);
8836
8837 mailbox.gc_idle_workers().await;
8839 let workers = mailbox.workers.read().await;
8840 assert!(workers.is_empty());
8841 }
8842
8843 fn make_run_record(run_id: &str, thread_id: &str, status: RunStatus) -> RunRecord {
8846 RunRecord {
8847 run_id: run_id.to_string(),
8848 thread_id: thread_id.to_string(),
8849 agent_id: "agent".to_string(),
8850 parent_run_id: None,
8851 request: None,
8852 input: None,
8853 output: None,
8854 status,
8855 termination_reason: None,
8856 final_output: None,
8857 error_payload: None,
8858 dispatch_id: None,
8859 session_id: None,
8860 transport_request_id: None,
8861 waiting: None,
8862 outcome: None,
8863 created_at: 1,
8864 started_at: None,
8865 finished_at: None,
8866 updated_at: 1,
8867 steps: 0,
8868 input_tokens: 0,
8869 output_tokens: 0,
8870 state: None,
8871 }
8872 }
8873
8874 fn make_waiting_run_record(run_id: &str, thread_id: &str) -> RunRecord {
8875 let mut run = make_run_record(run_id, thread_id, RunStatus::Waiting);
8876 run.waiting = Some(RunWaitingState {
8877 reason: WaitingReason::BackgroundTasks,
8878 ticket_ids: Vec::new(),
8879 tickets: Vec::new(),
8880 since_dispatch_id: None,
8881 message: None,
8882 });
8883 run
8884 }
8885
8886 fn make_noop_mailbox(thread_store: Arc<InMemoryStore>) -> Arc<Mailbox> {
8887 let mailbox_store = make_store();
8888 let runtime: Arc<dyn RunDispatchExecutor> = Arc::new(NoopMailboxRuntime);
8889 Arc::new(Mailbox::new_with_executor(
8890 runtime,
8891 mailbox_store,
8892 thread_store,
8893 "test-consumer".into(),
8894 MailboxConfig::default(),
8895 ))
8896 }
8897
8898 #[tokio::test]
8899 async fn thread_context_cache_used_by_reusable_waiting_run_id() {
8900 let thread_store = Arc::new(InMemoryStore::new());
8901 let mailbox = make_noop_mailbox(thread_store.clone());
8902 let thread_id = "thread-ctx-reuse";
8903
8904 let run = make_waiting_run_record("run-waiting", thread_id);
8906 thread_store
8907 .checkpoint(thread_id, &[Message::user("hi")], &run)
8908 .await
8909 .unwrap();
8910
8911 let worker = mailbox.get_or_create_worker(thread_id).await;
8913 let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
8914 .await
8915 .unwrap();
8916 {
8917 let mut w = worker.lock();
8918 w.thread_ctx = Some(ctx);
8919 }
8920
8921 let result = mailbox.reusable_waiting_run_id(thread_id).await;
8922 assert_eq!(result, Some("run-waiting".to_string()));
8923 }
8924
8925 #[tokio::test]
8926 async fn thread_context_cache_updated_after_prepare_checkpoint() {
8927 let thread_store = Arc::new(InMemoryStore::new());
8928 let mailbox = make_noop_mailbox(thread_store.clone());
8929 let thread_id = "thread-ctx-checkpoint";
8930
8931 let run = make_run_record("run-prev", thread_id, RunStatus::Done);
8933 thread_store
8934 .checkpoint(thread_id, &[Message::user("first")], &run)
8935 .await
8936 .unwrap();
8937
8938 let worker = mailbox.get_or_create_worker(thread_id).await;
8940 let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
8941 .await
8942 .unwrap();
8943 {
8944 let mut w = worker.lock();
8945 w.thread_ctx = Some(ctx);
8946 }
8947
8948 let mut request =
8950 RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
8951 let msgs = request.messages.clone();
8952 mailbox
8953 .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
8954 .await
8955 .expect("prepare should succeed");
8956
8957 let w = worker.lock();
8959 let ctx = w.thread_ctx.as_ref().expect("cache should exist");
8960 assert_eq!(ctx.messages.len(), 2, "cache should have 2 messages");
8961 assert!(ctx.latest_run.is_some(), "cache should have latest run");
8962 }
8963
8964 #[tokio::test]
8965 async fn prepare_run_falls_back_to_store_without_cache() {
8966 let thread_store = Arc::new(InMemoryStore::new());
8967 let mailbox = make_noop_mailbox(thread_store.clone());
8968 let thread_id = "thread-no-cache";
8969
8970 let run = make_run_record("run-prev", thread_id, RunStatus::Done);
8972 thread_store
8973 .checkpoint(thread_id, &[Message::user("first")], &run)
8974 .await
8975 .unwrap();
8976
8977 let mut request =
8979 RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
8980 let msgs = request.messages.clone();
8981 let run_id = mailbox
8982 .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
8983 .await
8984 .expect("should succeed from store fallback");
8985 assert!(!run_id.is_empty());
8986
8987 let stored = thread_store
8989 .load_messages(thread_id)
8990 .await
8991 .unwrap()
8992 .unwrap();
8993 assert_eq!(stored.len(), 2);
8994 }
8995
8996 #[tokio::test]
8997 async fn prepare_run_uses_durable_messages_when_active_cache_is_stale() {
8998 let thread_store = Arc::new(InMemoryStore::new());
8999 let mailbox = make_noop_mailbox(thread_store.clone());
9000 let thread_id = "thread-stale-cache";
9001
9002 let active = make_run_record("run-active", thread_id, RunStatus::Running);
9003 thread_store
9004 .checkpoint(thread_id, &[Message::user("first")], &active)
9005 .await
9006 .unwrap();
9007
9008 let worker = mailbox.get_or_create_worker(thread_id).await;
9010 let stale_ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9011 .await
9012 .unwrap();
9013 {
9014 let mut w = worker.lock();
9015 w.thread_ctx = Some(stale_ctx);
9016 }
9017
9018 thread_store
9021 .checkpoint(
9022 thread_id,
9023 &[Message::user("first"), Message::assistant("active output")],
9024 &active,
9025 )
9026 .await
9027 .unwrap();
9028
9029 let mut request =
9030 RunRequest::new(thread_id, vec![Message::user("second")]).with_agent_id("agent");
9031 let msgs = request.messages.clone();
9032 mailbox
9033 .prepare_run_for_dispatch(&mut request, thread_id, &msgs)
9034 .await
9035 .expect("prepare should preserve active-run checkpoint");
9036
9037 let stored = thread_store
9038 .load_messages(thread_id)
9039 .await
9040 .unwrap()
9041 .unwrap();
9042 assert_eq!(stored.len(), 3);
9043 assert_eq!(stored[1].text(), "active output");
9044 assert_eq!(stored[2].text(), "second");
9045 }
9046
9047 #[tokio::test]
9048 async fn reusable_waiting_run_id_ignores_stale_worker_cache() {
9049 let thread_store = Arc::new(InMemoryStore::new());
9050 let mailbox = make_noop_mailbox(thread_store.clone());
9051 let thread_id = "thread-stale-waiting-cache";
9052
9053 let waiting = make_waiting_run_record("run-waiting", thread_id);
9054 thread_store
9055 .checkpoint(thread_id, &[Message::user("hi")], &waiting)
9056 .await
9057 .unwrap();
9058
9059 let worker = mailbox.get_or_create_worker(thread_id).await;
9060 let stale_ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9061 .await
9062 .unwrap();
9063 {
9064 let mut w = worker.lock();
9065 w.thread_ctx = Some(stale_ctx);
9066 }
9067
9068 let done = make_run_record("run-waiting", thread_id, RunStatus::Done);
9069 thread_store
9070 .checkpoint(
9071 thread_id,
9072 &[Message::user("hi"), Message::assistant("done")],
9073 &done,
9074 )
9075 .await
9076 .unwrap();
9077
9078 assert_eq!(mailbox.reusable_waiting_run_id(thread_id).await, None);
9079 }
9080
9081 #[tokio::test]
9082 async fn thread_context_cache_cleared_on_idle_transition() {
9083 let thread_store = Arc::new(InMemoryStore::new());
9084 let mailbox = make_noop_mailbox(thread_store.clone());
9085 let thread_id = "thread-ctx-clear";
9086
9087 let run = make_run_record("r1", thread_id, RunStatus::Done);
9088 thread_store
9089 .checkpoint(thread_id, &[Message::user("hi")], &run)
9090 .await
9091 .unwrap();
9092
9093 let worker = mailbox.get_or_create_worker(thread_id).await;
9095 let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9096 .await
9097 .unwrap();
9098 {
9099 let mut w = worker.lock();
9100 w.thread_ctx = Some(ctx);
9101 w.status = MailboxWorkerStatus::Running {
9102 dispatch_id: "d1".into(),
9103 run_id: "r1".into(),
9104 lease_handle: tokio::spawn(async {}),
9105 sink: Arc::new(ReconnectableEventSink::new(mpsc::channel(16).0)),
9106 };
9107 }
9108
9109 assert!(worker.lock().thread_ctx.is_some());
9111
9112 {
9114 let mut w = worker.lock();
9115 let old = std::mem::replace(&mut w.status, MailboxWorkerStatus::Idle);
9116 w.thread_ctx = None;
9117 if let MailboxWorkerStatus::Running { lease_handle, .. } = old {
9118 lease_handle.abort();
9119 }
9120 }
9121
9122 assert!(
9123 worker.lock().thread_ctx.is_none(),
9124 "cache should be cleared on idle transition"
9125 );
9126 }
9127
9128 #[tokio::test]
9129 async fn thread_context_load_populates_run_cache() {
9130 let store = Arc::new(InMemoryStore::new());
9131 let thread_id = "thread-load-test";
9132
9133 let run = make_run_record("r1", thread_id, RunStatus::Done);
9134 store
9135 .checkpoint(thread_id, &[Message::user("msg")], &run)
9136 .await
9137 .unwrap();
9138
9139 let ctx = ThreadContext::load(store.as_ref(), thread_id)
9140 .await
9141 .expect("load should succeed");
9142
9143 assert_eq!(ctx.messages.len(), 1);
9144 assert!(ctx.latest_run.is_some());
9145 assert_eq!(ctx.latest_run.as_ref().unwrap().run_id, "r1");
9146 assert!(ctx.get_run("r1").is_some());
9147 assert!(ctx.get_run("unknown").is_none());
9148 }
9149
9150 #[tokio::test]
9151 async fn reusable_waiting_run_id_returns_none_for_done_cached_run() {
9152 let thread_store = Arc::new(InMemoryStore::new());
9153 let mailbox = make_noop_mailbox(thread_store.clone());
9154 let thread_id = "thread-done-run";
9155
9156 let run = make_run_record("run-done", thread_id, RunStatus::Done);
9157 thread_store
9158 .checkpoint(thread_id, &[Message::user("hi")], &run)
9159 .await
9160 .unwrap();
9161
9162 let worker = mailbox.get_or_create_worker(thread_id).await;
9163 let ctx = ThreadContext::load(thread_store.as_ref(), thread_id)
9164 .await
9165 .unwrap();
9166 {
9167 let mut w = worker.lock();
9168 w.thread_ctx = Some(ctx);
9169 }
9170
9171 let result = mailbox.reusable_waiting_run_id(thread_id).await;
9172 assert_eq!(result, None, "Done run should not be reusable");
9173 }
9174}