edict/commands/
worker_loop.rs

1use std::path::{Path, PathBuf};
2use std::process;
3
4use anyhow::Context;
5
6use crate::config::Config;
7use crate::subprocess::Tool;
8
/// Worker loop state and configuration.
///
/// Built once by `WorkerLoop::new` from CLI arguments, the project config,
/// and the `EDICT_*` dispatch environment variables; afterwards it is only
/// read (by `run_once` / `build_prompt`).
#[derive(Debug, Clone)]
pub struct WorkerLoop {
    /// Project root: the explicit argument, or the current directory.
    project_root: PathBuf,
    /// Agent name; empty when no name was supplied on the command line.
    agent: String,
    /// Project name, taken from the config's channel.
    project: String,
    /// Models to try, in order, when running the agent.
    model_pool: Vec<String>,
    /// Timeout handed to the agent runner (defaults to 900; presumably
    /// seconds — confirm against the runner).
    timeout: u64,
    /// Whether the review step is included in the prompt.
    review_enabled: bool,
    /// Approvers named in the risk:critical review instructions.
    critical_approvers: Vec<String>,
    /// Pre-assigned bone ID (`EDICT_BONE`, legacy `EDICT_BEAD`), if valid.
    dispatched_bone: Option<String>,
    /// Pre-assigned workspace name (`EDICT_WORKSPACE`), if valid.
    dispatched_workspace: Option<String>,
    /// Mission ID (`EDICT_MISSION`), if valid.
    dispatched_mission: Option<String>,
    /// Informational sibling list (`EDICT_SIBLINGS`), size-limited.
    dispatched_siblings: Option<String>,
    /// Informational mission outcome (`EDICT_MISSION_OUTCOME`), size-limited.
    dispatched_mission_outcome: Option<String>,
    /// Informational file-ownership hints (`EDICT_FILE_HINTS`), size-limited.
    dispatched_file_hints: Option<String>,
}
25
26impl WorkerLoop {
27    /// Create a new worker loop instance.
28    pub fn new(
29        project_root: Option<PathBuf>,
30        agent: Option<String>,
31        model: Option<String>,
32    ) -> anyhow::Result<Self> {
33        let project_root = project_root
34            .or_else(|| std::env::current_dir().ok())
35            .context("determining project root")?;
36
37        // Find and load config
38        let config = load_config(&project_root)?;
39
40        // Agent name: CLI arg > auto-generated (empty for worker)
41        let agent = agent.unwrap_or_default();
42
43        // Set AGENT and BOTBUS_AGENT env so spawned tools resolve identity correctly
44        // SAFETY: single-threaded at this point in startup, before spawning any threads
45        if !agent.is_empty() {
46            unsafe {
47                std::env::set_var("AGENT", &agent);
48                std::env::set_var("BOTBUS_AGENT", &agent);
49            }
50        }
51
52        // Apply config [env] vars to our own process so tools we invoke (cargo, etc.) inherit them
53        let resolved_env = config.resolved_env();
54        for (k, v) in &resolved_env {
55            // SAFETY: single-threaded at startup
56            unsafe {
57                std::env::set_var(k, v);
58            }
59        }
60
61        // Emit startup diagnostic for build-related env vars.
62        // This confirms whether vars from .botbox.toml [env] actually reach
63        // the process where cargo runs, helping diagnose OOM issues from
64        // unthrottled parallel builds in multi-agent setups.
65        emit_build_env_diagnostic(&resolved_env);
66
67        // Project name from config
68        let project = config.channel();
69
70        // Model: CLI arg > config > default, then resolve to pool for fallback
71        let worker_config = config.agents.worker.as_ref();
72        let model_raw = model
73            .or_else(|| worker_config.map(|w| w.model.clone()))
74            .unwrap_or_default();
75        let model_pool = config.resolve_model_pool(&model_raw);
76
77        let timeout = worker_config.map(|w| w.timeout).unwrap_or(900);
78        let review_enabled = config.review.enabled;
79        let critical_approvers = config
80            .project
81            .critical_approvers
82            .clone()
83            .unwrap_or_default();
84
85        // Dispatched worker env vars (set by dev-loop)
86        // Validate bone ID format: bd-XXXX (alphanumeric + hyphens)
87        let dispatched_bone = std::env::var("EDICT_BONE")
88            .or_else(|_| std::env::var("EDICT_BEAD"))
89            .ok()
90            .filter(|v| {
91                !v.is_empty()
92                    && v.len() <= 20
93                    && v.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-')
94            });
95        // Validate workspace name: lowercase alphanumeric + hyphens, no path components
96        let dispatched_workspace = std::env::var("EDICT_WORKSPACE").ok().filter(|v| {
97            !v.is_empty()
98                && v.len() <= 64
99                && v.bytes()
100                    .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-')
101                && !v.starts_with('-')
102                && !v.contains("..")
103        });
104        // Mission ID has same format as bone ID
105        let dispatched_mission = std::env::var("EDICT_MISSION").ok().filter(|v| {
106            !v.is_empty()
107                && v.len() <= 20
108                && v.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-')
109        });
110        // Siblings and file hints are informational — limit size to prevent prompt bloat
111        let dispatched_siblings = std::env::var("EDICT_SIBLINGS").ok().map(|v| {
112            if v.len() > 4096 {
113                v[..v.floor_char_boundary(4096)].to_string()
114            } else {
115                v
116            }
117        });
118        let dispatched_mission_outcome = std::env::var("EDICT_MISSION_OUTCOME").ok().map(|v| {
119            if v.len() > 2048 {
120                v[..v.floor_char_boundary(2048)].to_string()
121            } else {
122                v
123            }
124        });
125        let dispatched_file_hints = std::env::var("EDICT_FILE_HINTS").ok().map(|v| {
126            if v.len() > 4096 {
127                v[..v.floor_char_boundary(4096)].to_string()
128            } else {
129                v
130            }
131        });
132
133        Ok(Self {
134            project_root,
135            agent,
136            project,
137            model_pool,
138            timeout,
139            review_enabled,
140            critical_approvers,
141            dispatched_bone,
142            dispatched_workspace,
143            dispatched_mission,
144            dispatched_siblings,
145            dispatched_mission_outcome,
146            dispatched_file_hints,
147        })
148    }
149
150    /// Run one iteration of the worker loop.
151    pub fn run_once(&self) -> anyhow::Result<LoopStatus> {
152        // Set up cleanup handlers
153        register_cleanup_handlers(&self.agent, &self.project);
154
155        // Build prompt for Claude
156        let prompt = self.build_prompt();
157
158        // Run agent via edict run agent (Pi by default), with rate limit fallback
159        let start = crate::telemetry::metrics::time_start();
160        let output = run_agent_with_fallback(&prompt, &self.model_pool, self.timeout)?;
161        crate::telemetry::metrics::time_record(
162            "edict.worker.agent_run_duration_seconds",
163            start,
164            &[("agent", &self.agent), ("project", &self.project)],
165        );
166
167        // Parse completion signal
168        let status = parse_completion_signal(&output);
169        crate::telemetry::metrics::counter(
170            "edict.worker.runs_total",
171            1,
172            &[("agent", &self.agent), ("project", &self.project)],
173        );
174
175        Ok(status)
176    }
177
178    /// Build the worker loop prompt.
179    fn build_prompt(&self) -> String {
180        let dispatched_section = if let (Some(bone), Some(ws)) =
181            (&self.dispatched_bone, &self.dispatched_workspace)
182        {
183            let mission_section = if let Some(ref mission) = self.dispatched_mission {
184                let outcome = if let Some(ref outcome) = self.dispatched_mission_outcome {
185                    format!("Mission outcome: {outcome}")
186                } else {
187                    format!("Read mission context: maw exec default -- bn show {mission}")
188                };
189
190                let siblings = if let Some(ref sibs) = self.dispatched_siblings {
191                    format!("\nSibling bones (other workers in this mission):\n{sibs}")
192                } else {
193                    String::new()
194                };
195
196                let file_hints = if let Some(ref hints) = self.dispatched_file_hints {
197                    format!(
198                        "\nAdvisory file ownership (avoid editing files owned by siblings):\n{hints}"
199                    )
200                } else {
201                    String::new()
202                };
203
204                format!("Mission: {mission}\n{outcome}{siblings}{file_hints}")
205            } else {
206                String::new()
207            };
208
209            let ws_path = self.project_root.join("ws").join(ws);
210
211            format!(
212                r#"## DISPATCHED WORKER — FAST PATH
213
214You were dispatched by a lead dev agent with a pre-assigned bone and workspace.
215Skip steps 0 (RESUME CHECK), 1 (INBOX), and 2 (TRIAGE) entirely.
216
217Pre-assigned bone: {bone}
218Pre-assigned workspace: {ws}
219Workspace path: {ws_path}
220{mission_section}
221
222Go directly to:
2231. Verify your bone: maw exec default -- bn show {bone}
2242. Verify your workspace: maw ws list (confirm {ws} exists)
2253. Your bone is already doing and claimed. Proceed to step 4 (WORK).
226   Use absolute workspace path: {ws_path}
227   For commands in workspace: maw exec {ws} -- <command>
228
229"#,
230                bone = bone,
231                ws = ws,
232                ws_path = ws_path.display(),
233                mission_section = mission_section,
234            )
235        } else {
236            String::new()
237        };
238
239        let dispatched_intro = if self.dispatched_bone.is_some() {
240            "You are a dispatched worker — follow the FAST PATH section below."
241        } else {
242            r#"Execute exactly ONE cycle of the worker loop. Complete one task (or determine there is no work),
243then STOP. Do not start a second task — the outer loop handles iteration."#
244        };
245
246        let review_step_6 = if self.review_enabled {
247            format!(
248                r#"6. REVIEW REQUEST (risk-aware):
249   First, check the bone's risk label: maw exec default -- bn show <id> — look for risk:low, risk:high, or risk:critical labels.
250   No risk label = risk:medium (standard review, current default).
251
252   RISK:LOW PATH (evals, docs, tests, config) — Self-review and merge directly:
253     No security review needed regardless of REVIEW setting.
254     Add self-review comment: maw exec default -- bn bone comment add <id> "Self-review (risk:low): <what you verified>"
255     Proceed directly to step 7 (FINISH).
256
257   RISK:MEDIUM PATH — Standard review (current default):
258     Try protocol command: edict protocol review <bone-id> --agent {agent}
259     Read the output carefully. If status is Ready, run the suggested commands.
260     If it fails (exit 1 = command unavailable), fall back to manual review request:
261       CHECK for existing review first:
262         - Run: maw exec default -- bn comments <id> | grep "Review created:"
263         - If found, extract <review-id> and skip to requesting review (don't create duplicate)
264       Create review with reviewer assignment (only if none exists):
265         - maw exec $WS -- crit reviews create --agent {agent} --title "<id>: <title>" --description "<summary>" --reviewers {project}-security
266         - IMMEDIATELY record: maw exec default -- bn bone comment add <id> "Review created: <review-id> in workspace $WS"
267       bus statuses set --agent {agent} "Review: <review-id>".
268       Spawn reviewer via @mention: bus send --agent {agent} {project} "Review requested: <review-id> for <id> @{project}-security" -L review-request
269     Do NOT close the bone. Do NOT merge. Do NOT release claims.
270     Output: <promise>COMPLETE</promise>
271     STOP this iteration.
272
273   RISK:HIGH PATH — Security review + failure-mode checklist:
274     Same as risk:medium, but when creating the review, add to description: "risk:high — failure-mode checklist required."
275     The security reviewer will include the 5 failure-mode questions in their review:
276       1. What could fail in production?  2. How would we detect it quickly?
277       3. What is the fastest safe rollback?  4. What dependency could invalidate this plan?
278       5. What assumption is least certain?
279     MUST request security reviewer. Do not skip.
280     STOP this iteration.
281
282   RISK:CRITICAL PATH — Security review + human approval required:
283     Same as risk:high, but ALSO:
284     - Add to review description: "risk:critical — REQUIRES HUMAN APPROVAL before merge."
285     - Post to bus requesting human approval:
286       bus send --agent {agent} {project} "risk:critical review for <id>: requires human approval before merge. {critical_approvers}" -L review-request
287     STOP this iteration."#,
288                agent = self.agent,
289                project = self.project,
290                critical_approvers = if self.critical_approvers.is_empty() {
291                    "Check project.critical_approvers in .edict.toml".to_string()
292                } else {
293                    format!("Approvers: {}", self.critical_approvers.join(", "))
294                }
295            )
296        } else {
297            r#"   REVIEW is disabled. Skip code review.
298   Proceed directly to step 7 (FINISH)."#
299                .to_string()
300        };
301
302        let finish_step_7 = if self.dispatched_bone.is_some() {
303            format!(
304                r#"7. FINISH (dispatched worker — lead handles merge):
305   Try protocol command: edict protocol finish <bone-id> --agent {agent} --no-merge
306   Read the output carefully. If status is Ready, run the suggested commands.
307   If it fails (exit 1 = command unavailable), fall back to manual finish:
308     Close bone: maw exec default -- bn done <id> --reason "Completed"
309     Announce: bus send --agent {agent} {project} "Completed <id>: <title>" -L task-done
310     Release bone claim: bus claims release --agent {agent} "bone://{project}/<id>"
311     Do NOT merge the workspace — the lead dev will handle merging via the merge protocol.
312     Do NOT run the release check — the lead handles releases.
313   Output: <promise>COMPLETE</promise>"#,
314                agent = self.agent,
315                project = self.project,
316            )
317        } else {
318            format!(
319                r#"7. FINISH (only reached after LGTM from step 0, or after step 6 when REVIEW is false):
320   Try protocol command: edict protocol finish <bone-id> --agent {agent}
321   Read the output carefully. If status is Ready, run the suggested commands.
322   If it fails (exit 1 = command unavailable), fall back to manual finish:
323     If a review was conducted:
324       maw exec default -- crit reviews mark-merged <review-id> --agent {agent}.
325     RISK:CRITICAL CHECK — Before merging a risk:critical bone:
326       Verify human approval exists: bus history {project} -n 50 -L review-request | look for approval message referencing this bone/review from an authorized approver.
327       If no approval found, do NOT merge. Post: bus send --agent {agent} {project} "Waiting for human approval on risk:critical <id>" -L review-request. STOP.
328       If approval found, record it: maw exec default -- bn bone comment add <id> "Human approval: <approver> via bus message <msg-id>"
329     maw exec default -- bn bone comment add <id> "Completed by {agent}".
330     maw exec default -- bn done <id> --reason "Completed" --suggest-next.
331     bus send --agent {agent} {project} "Completed <id>: <title>" -L task-done.
332     bus claims release --agent {agent} "bone://{project}/<id>".
333     Keep workspace claim — the lead will merge it.
334   STOP — do not proceed to RELEASE CHECK (only leads check for releases after merging)."#,
335                agent = self.agent,
336                project = self.project,
337            )
338        };
339
340        let review_status_str = if self.review_enabled { "true" } else { "false" };
341        let review_note = if self.review_enabled {
342            "risk:low (evals, docs, tests, config) skips security review — self-review and merge directly. risk:medium gets standard review. risk:high requires failure-mode checklist. risk:critical requires human approval."
343        } else {
344            "Review is disabled. Skip review and proceed to FINISH after describing commit."
345        };
346
347        format!(
348            r#"You are worker agent "{agent}" for project "{project}".
349
350IMPORTANT: Use --agent {agent} on ALL bus and crit commands. bn resolves agent identity from $AGENT/$BOTBUS_AGENT env automatically. Set EDICT_PROJECT={project}.
351
352CRITICAL - HUMAN MESSAGE PRIORITY: If you see a system reminder with "STOP:" showing unread bus messages, these are from humans or other agents trying to reach you. IMMEDIATELY check inbox and respond before continuing your current task. Human questions, clarifications, and redirects take priority over heads-down work.
353
354COMMAND PATTERN — maw exec: All bn commands run in the default workspace. All crit commands run in their workspace.
355  bn:   maw exec default -- bn <args>
356  crit: maw exec $WS -- crit <args>
357  other: maw exec $WS -- <command>           (cargo test, etc.)
358
359VERSION CONTROL: This project uses Git + maw. Do NOT run jj commands.
360  Workers commit with: maw exec $WS -- git add -A && maw exec $WS -- git commit -m "<message>"
361  The lead handles merging workspaces into main.
362
363{dispatched}{dispatched_intro}
364
365At the end of your work, output exactly one of these completion signals:
366- <promise>COMPLETE</promise> if you completed a task or determined there is no work
367- <promise>BLOCKED</promise> if you are stuck and cannot proceed
368
3690. RESUME CHECK (do this FIRST):
370   Try protocol command: edict protocol resume --agent {agent}
371   If it fails (exit 1 = command unavailable), fall back to manual resume check:
372     Run: bus claims list --agent {agent} --mine
373     If you hold a bone:// claim, you have an in-progress bone from a previous iteration.
374     - Run: maw exec default -- bn comments <bone-id> to understand what was done before and what remains.
375     - Look for workspace info in comments (workspace name and path).
376     - If a "Review created: <review-id>" comment exists:
377       * Find the review: maw exec $WS -- crit review <review-id>
378       * Check review status: maw exec $WS -- crit review <review-id>
379       * If LGTM (approved): proceed to FINISH (step 7) — merge the review and close the bone.
380       * If BLOCKED (changes requested): fix the issues, then re-request review:
381         1. Read threads: maw exec $WS -- crit review <review-id> (threads show inline with comments)
382         2. For each unresolved thread with reviewer feedback:
383            - Fix the code in the workspace (use absolute WS_PATH for file edits)
384            - Reply: maw exec $WS -- crit reply <thread-id> --agent {agent} "Fixed: <what you did>"
385            - Resolve: maw exec $WS -- crit threads resolve <thread-id> --agent {agent}
386         3. Re-request: maw exec $WS -- crit reviews request <review-id> --reviewers {project}-security --agent {agent}
387         5. Announce: bus send --agent {agent} {project} "Review updated: <review-id> — addressed feedback @{project}-security" -L review-response
388         STOP this iteration — wait for re-review.
389       * If PENDING (no votes yet): STOP this iteration. Wait for the reviewer.
390       * If review not found: DO NOT merge or create a new review. The reviewer may still be starting up (hooks have latency). STOP this iteration and wait. Only create a new review if the workspace was destroyed AND 3+ iterations have passed since the review comment.
391     - If no review comment (work was in progress when session ended):
392       * Read the workspace code to see what's already done.
393       * Complete the remaining work in the EXISTING workspace — do NOT create a new one.
394       * After completing: maw exec default -- bn bone comment add <id> "Resumed and completed: <what you finished>".
395       * Then proceed to step 6 (REVIEW REQUEST) or step 7 (FINISH).
396     If no active claims: proceed to step 1 (INBOX).
397
3981. INBOX (do this before triaging):
399   Run: bus inbox --agent {agent} --channels {project} --mark-read
400   For each message:
401   - Task request (-L task-request or asks for work): create a bone with maw exec default -- bn create.
402   - Status check or question: reply on bus, do NOT create a bone.
403   - Feedback (-L feedback): if it contains a bug report, feature request, or actionable work — create a bone. Evaluate critically: is this a real issue? Is it well-scoped? Set priority accordingly. Then acknowledge on bus.
404   - Announcements from other agents ("Working on...", "Completed...", "online"): ignore, no action.
405   - Duplicate of existing bone: do NOT create another bone, note it covers the request.
406
4072. TRIAGE: Check maw exec default -- bn next. If no ready bones and inbox created none, say "NO_WORK_AVAILABLE" and stop.
408   GROOM each ready bone (maw exec default -- bn show <id>): ensure clear title, description with acceptance criteria
409   and testing strategy, appropriate priority, and risk label. Fix anything missing, comment what you changed.
410   RISK LABELS: Assess each bone for risk using these dimensions: blast radius, data sensitivity, reversibility, dependency uncertainty.
411   - risk:low — typo fixes, doc updates, config tweaks (add label: bn bone tag <id> risk:low)
412   - risk:medium — standard features/bugs (default, no label needed)
413   - risk:high — security-sensitive, data integrity, user-visible behavior changes (add label)
414   - risk:critical — irreversible actions, migrations, regulated changes (add label)
415   Any agent can escalate risk upward. Downgrades require lead approval with justification comment.
416   Use maw exec default -- bn --robot-next to pick exactly one small task. If the task is large, break it down with
417   maw exec default -- bn create + bn triage dep add, then bn next again. If a bone is claimed
418   (bus claims check --agent {agent} "bone://{project}/<id>"), skip it.
419
420   MISSION CONTEXT: After picking a bone, check if it has a mission:bd-xxx label (visible in bn show output).
421   If it does, read the mission bone for shared context:
422     maw exec default -- bn show <mission-id>
423   Note the mission's Outcome, Constraints, and Stop criteria. Check siblings:
424     maw exec default -- bn list -l "mission:<mission-id>"
425   Use this context to understand how your work fits into the larger effort.
426
427   SIBLING COORDINATION (missions only):
428   When working on a mission bone, you share the codebase with sibling workers. Coordinate through bus:
429
430   READ siblings: Before editing a file listed in EDICT_FILE_HINTS as owned by a sibling, and periodically
431   during work (~every 5 minutes), check for sibling messages:
432     bus history {project} -n 10 -L "mission:<mission-id>" --since "5 minutes ago"
433   Look for coord:interface messages — these tell you about API/schema/config changes siblings made.
434   If a sibling changed something you depend on, adapt your implementation to match.
435
436   POST discoveries: When you change an API, schema, config format, shared type, or exported interface
437   that siblings might depend on, announce it immediately:
438     bus send --agent {agent} {project} "<file>: <what changed and why>" -L coord:interface -L "mission:<mission-id>"
439
440   COORDINATION LABELS on bus messages:
441   - coord:interface — API/schema/config changes that affect siblings
442   - coord:blocker — You need something from a sibling: bus send --agent {agent} {project} "Blocked by <sibling-bone>: <reason>" -L coord:blocker -L "mission:<mission-id>"
443   - task-done — Signal completion: bus send --agent {agent} {project} "Completed <id>" -L task-done -L "mission:<mission-id>"
444
4453. START: Try protocol command: edict protocol start <bone-id> --agent {agent}
446   Read the output carefully. If status is Ready, run the suggested commands.
447   If it fails (exit 1 = command unavailable), fall back to manual start:
448     maw exec default -- bn do <id>.
449     bus claims stake --agent {agent} "bone://{project}/<id>" -m "<id>".
450     Create workspace: run maw ws create --random. Note the workspace name AND absolute path
451     from the output (e.g., name "frost-castle", path "/abs/path/ws/frost-castle").
452     Store the name as WS and the absolute path as WS_PATH.
453     IMPORTANT: All file operations (Read, Write, Edit) must use the absolute WS_PATH.
454     For commands in the workspace: maw exec $WS -- <command>.
455     Do NOT cd into the workspace and stay there — the workspace is destroyed during finish.
456     bus claims stake --agent {agent} "workspace://{project}/$WS" -m "<id>".
457     maw exec default -- bn bone comment add <id> "Started in workspace $WS ($WS_PATH)".
458     bus statuses set --agent {agent} "Working: <id>" --ttl 30m.
459     Announce: bus send --agent {agent} {project} "Working on <id>: <title>" -L task-claim.
460
4614. WORK: maw exec default -- bn show <id>, then implement the task in the workspace.
462   If this bone is part of a mission, check bus for sibling updates BEFORE starting implementation:
463     bus history {project} -n 10 -L "mission:<mission-id>" -L coord:interface
464   Adapt your approach if siblings have already defined interfaces you need to consume or conform to.
465   Add at least one progress comment: maw exec default -- bn bone comment add <id> "Progress: ...".
466
4675. STUCK CHECK: If same approach tried twice, info missing, or tool fails repeatedly — you are
468   stuck. maw exec default -- bn bone comment add <id> "Blocked: <details>".
469   bus statuses set --agent {agent} "Blocked: <short reason>".
470   bus send --agent {agent} {project} "Stuck on <id>: <reason>" -L task-blocked.
471   maw exec default -- bn bone tag <id> blocked.
472   Release: bus claims release --agent {agent} "bone://{project}/<id>".
473   Output: <promise>BLOCKED</promise>
474   Stop this cycle.
475
476{review_step}
477
478{finish_step}
479
4808. CLEANUP (always run before stopping, even on error or BLOCKED):
481   Try protocol command: edict protocol cleanup --agent {agent}
482   If it fails (exit 1 = command unavailable), fall back to manual cleanup:
483     bus statuses clear --agent {agent}
484     (bn is event-sourced — no sync needed)
485
486Key rules:
487- Exactly one small task per cycle.
488- Always finish or release before stopping.
489- If claim denied, pick something else.
490- All bus and crit commands use --agent {agent}.
491- All file operations use the absolute workspace path from maw ws create output. Do NOT cd into the workspace and stay there.
492- All bn commands: maw exec default -- bn ...
493- All crit/git commands in a workspace: maw exec $WS -- crit/git ...
494- If a tool behaves unexpectedly, report it: bus send --agent {agent} {project} "Tool issue: <details>" -L tool-issue.
495- STOP after completing one task or determining no work. Do not loop.
496- Always output <promise>COMPLETE</promise> or <promise>BLOCKED</promise> at the end.
497- RISK LABELS: Check bone risk labels before review. REVIEW={review_status}. {review_note}"#,
498            agent = self.agent,
499            project = self.project,
500            dispatched = dispatched_section,
501            dispatched_intro = dispatched_intro,
502            review_step = review_step_6,
503            finish_step = finish_step_7,
504            review_status = review_status_str,
505            review_note = review_note,
506        )
507    }
508}
509
/// Outcome of one worker-loop iteration, as returned by `WorkerLoop::run_once`.
///
/// The value is produced by `parse_completion_signal` from the agent output;
/// that function is not shown here, so the notes on the variants below are
/// assumptions based on the signals the prompt asks the agent to emit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LoopStatus {
    /// Presumably the agent emitted `<promise>COMPLETE</promise>`.
    Complete,
    /// Presumably the agent emitted `<promise>BLOCKED</promise>`.
    Blocked,
    /// Presumably neither signal was recognised — confirm against the parser.
    Unknown,
}
517
/// Print a startup report of the build-related environment variables.
///
/// For each of `CARGO_BUILD_JOBS`, `RUSTC_WRAPPER` and `SCCACHE_DIR`, writes
/// one line to stderr showing the effective process value (an empty value
/// counts as unset) and how it relates to `config_env`: identical to the
/// config entry, different from it, present only in the environment, or
/// unset. If any variable is unset, a closing hint suggests adding it to
/// `.edict.toml [env]`.
///
/// `config_env` is only consulted for comparison; nothing is modified.
fn emit_build_env_diagnostic(config_env: &std::collections::HashMap<String, String>) {
    const BUILD_VARS: &[(&str, &str)] = &[
        ("CARGO_BUILD_JOBS", "limits parallel rustc processes"),
        ("RUSTC_WRAPPER", "enables sccache for build caching"),
        ("SCCACHE_DIR", "sccache cache directory"),
    ];

    eprintln!("--- build env diagnostic ---");

    let mut any_missing = false;
    for &(var, purpose) in BUILD_VARS {
        let effective = std::env::var(var).ok().filter(|v| !v.is_empty());
        let configured = config_env.get(var).map(String::as_str);

        // A variable is "missing" exactly when it has no effective value.
        any_missing |= effective.is_none();

        let report = match (effective.as_deref(), configured) {
            (Some(val), Some(cfg_val)) if val == cfg_val => {
                format!("  {var}={val} (from config)")
            }
            // Set, but not to the configured value: inherited from the parent
            // process or overridden by a vessel spawn --env flag.
            (Some(val), Some(cfg_val)) => {
                format!("  {var}={val} (effective; config has {cfg_val})")
            }
            // Set in the environment only (vessel spawn --env or parent).
            (Some(val), None) => {
                format!("  {var}={val} (from environment, not in config)")
            }
            // Not expected when the config env was applied just before this
            // call; reported for completeness.
            (None, Some(cfg_val)) => {
                format!("  {var}=<unset> (config has {cfg_val}, failed to apply?)")
            }
            (None, None) => format!("  {var}=<unset> — warning: {purpose}"),
        };
        eprintln!("{report}");
    }

    if any_missing {
        eprintln!(
            "  ⚠ Some build env vars are unset. Consider adding them to .edict.toml [env]"
        );
        eprintln!(
            "    to prevent OOM from unthrottled parallel builds in multi-agent setups."
        );
    }

    eprintln!("--- end build env diagnostic ---");
}
579
580/// Load config from .edict.toml (or legacy .botbox.toml/.botbox.json) using the canonical priority
581/// (root TOML > ws/default TOML > root JSON > ws/default JSON).
582fn load_config(root: &Path) -> anyhow::Result<Config> {
583    let (config_path, _config_dir) = crate::config::find_config_in_project(root)?;
584    Config::load(&config_path)
585}
586
587/// Run an agent with rate limit fallback across the model pool.
588///
589/// Tries each model in the pool sequentially. If a model returns a rate limit error (429),
590/// logs a warning and tries the next model. Returns error only when all models are exhausted
591/// or a non-rate-limit error occurs.
592fn run_agent_with_fallback(
593    prompt: &str,
594    model_pool: &[String],
595    timeout: u64,
596) -> anyhow::Result<String> {
597    for (i, model) in model_pool.iter().enumerate() {
598        if model_pool.len() > 1 {
599            eprintln!("Trying model {}/{}: {}", i + 1, model_pool.len(), model);
600        }
601        match try_run_agent(prompt, model, timeout) {
602            Ok(output) => {
603                if is_rate_limit_output(&output) {
604                    eprintln!(
605                        "Rate limited on {} (detected in output), trying next model...",
606                        model
607                    );
608                    crate::telemetry::metrics::counter(
609                        "edict.worker.rate_limit_retries_total",
610                        1,
611                        &[("model", model)],
612                    );
613                    continue;
614                }
615                // Empty or near-empty output means the model hung/crashed without
616                // producing useful work (e.g., Pi killed a hung Gemini process).
617                // Try the next model if available.
618                if output.trim().is_empty() && i + 1 < model_pool.len() {
619                    eprintln!(
620                        "Empty output from {} (process likely hung), trying next model...",
621                        model
622                    );
623                    continue;
624                }
625                return Ok(output);
626            }
627            Err(e) => {
628                let err_str = format!("{e:#}");
629                if (is_rate_limit_error(&err_str) || err_str.contains("exited with code"))
630                    && i + 1 < model_pool.len()
631                {
632                    eprintln!(
633                        "Failed on {} ({}), trying next model...",
634                        model,
635                        err_str.lines().next().unwrap_or("error")
636                    );
637                    continue;
638                }
639                return Err(e);
640            }
641        }
642    }
643    anyhow::bail!(
644        "All {} models in pool exhausted (rate limited)",
645        model_pool.len()
646    )
647}
648
/// Decide whether captured agent output reports a rate limit.
///
/// Matching is case-insensitive and requires BOTH the status code `429` and at least
/// one rate-limit phrase, so a bare "429" elsewhere in the output does not trigger
/// a fallback.
fn is_rate_limit_output(output: &str) -> bool {
    const PHRASES: [&str; 5] = [
        "rate limit",
        "rate_limit",
        "quota",
        "exhausted your capacity",
        "resource_exhausted",
    ];

    let haystack = output.to_lowercase();
    if !haystack.contains("429") {
        return false;
    }
    PHRASES.iter().any(|phrase| haystack.contains(*phrase))
}
659
/// Decide whether an error message reports a rate limit.
///
/// Matching is case-insensitive and looser than the output check: any single marker
/// (the `429` status code or one of the rate-limit phrases) is enough.
fn is_rate_limit_error(err: &str) -> bool {
    const MARKERS: [&str; 5] = [
        "429",
        "rate limit",
        "rate_limit",
        "quota",
        "resource_exhausted",
    ];

    let haystack = err.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(*marker))
}
669
670/// Run an agent via `edict run agent` (Pi by default).
671///
672/// Supports `provider/model:thinking` syntax for thinking levels.
673/// Echoes output to stderr for visibility in vessel while capturing stdout for parsing.
674fn try_run_agent(prompt: &str, model: &str, timeout: u64) -> anyhow::Result<String> {
675    use std::io::{BufRead, BufReader};
676    use std::process::{Command, Stdio};
677
678    let timeout_string = timeout.to_string();
679    let mut args = vec!["run", "agent", prompt, "-t", &timeout_string];
680
681    // Pass the full model string (e.g. "anthropic/claude-sonnet-4-6:medium") — Pi handles :suffix natively
682    if !model.is_empty() {
683        args.push("-m");
684        args.push(model);
685    }
686
687    let mut child = Command::new("edict")
688        .args(&args)
689        .stdin(Stdio::null())
690        .stdout(Stdio::piped())
691        .stderr(Stdio::inherit())
692        .spawn()
693        .context("spawning edict run agent")?;
694
695    let stdout = child.stdout.take().context("capturing stdout")?;
696    let reader = BufReader::new(stdout);
697
698    let mut output = String::new();
699    for line in reader.lines() {
700        let line = line.context("reading line from edict run agent")?;
701        // Echo to stderr for visibility in vessel
702        eprintln!("{}", line);
703        output.push_str(&line);
704        output.push('\n');
705    }
706
707    let status = child.wait().context("waiting for edict run agent")?;
708    if status.success() {
709        Ok(output)
710    } else {
711        let code = status.code().unwrap_or(-1);
712        anyhow::bail!("edict run agent exited with code {code}")
713    }
714}
715
716/// Parse completion signal from Claude output.
717fn parse_completion_signal(output: &str) -> LoopStatus {
718    if output.contains("<promise>COMPLETE</promise>") {
719        LoopStatus::Complete
720    } else if output.contains("<promise>BLOCKED</promise>") {
721        LoopStatus::Blocked
722    } else {
723        LoopStatus::Unknown
724    }
725}
726
727/// Register cleanup handlers for SIGINT/SIGTERM.
728fn register_cleanup_handlers(agent: &str, project: &str) {
729    let agent = agent.to_string();
730    let project = project.to_string();
731
732    ctrlc::set_handler(move || {
733        eprintln!("Received interrupt signal, cleaning up...");
734        let _ = cleanup(&agent, &project);
735        process::exit(0);
736    })
737    .expect("Error setting Ctrl-C handler");
738}
739
740/// Cleanup: release claims, clear status.
741fn cleanup(agent: &str, project: &str) -> anyhow::Result<()> {
742    eprintln!("Cleaning up...");
743
744    // All subprocess spawns below use .new_process_group() so they run in their
745    // own process group and survive the SIGTERM that triggered this cleanup
746    // (vessel kill sends SIGTERM to the parent's process group, which would
747    // otherwise kill these children before they complete).
748
749    // Reset orphaned doing bones
750    let result = Tool::new("bn")
751        .args(&["list", "--state", "doing", "--assignee", agent, "--json"])
752        .in_workspace("default")?
753        .new_process_group()
754        .run();
755
756    if let Ok(output) = result
757        && let Ok(bones) = output.parse_json::<Vec<serde_json::Value>>()
758    {
759        for bone in bones {
760            if let Some(id) = bone.get("id").and_then(|v| v.as_str()) {
761                // bn doesn't have an "undo" command — just add a comment noting the orphan
762                let _ = Tool::new("bn")
763                    .args(&[
764                        "bone",
765                        "comment",
766                        "add",
767                        id,
768                        &format!("Worker {agent} exited without completing. Needs reassignment."),
769                    ])
770                    .in_workspace("default")?
771                    .new_process_group()
772                    .run();
773                eprintln!("Noted orphaned bone {id}");
774            }
775        }
776    }
777
778    // Sign off on bus
779    let _ = Tool::new("bus")
780        .args(&[
781            "send",
782            "--agent",
783            agent,
784            project,
785            &format!("Agent {agent} signing off."),
786            "-L",
787            "agent-idle",
788        ])
789        .new_process_group()
790        .run();
791
792    // Clear status
793    let _ = Tool::new("bus")
794        .args(&["statuses", "clear", "--agent", agent])
795        .new_process_group()
796        .run();
797
798    // Release agent claim
799    let _ = Tool::new("bus")
800        .args(&[
801            "claims",
802            "release",
803            "--agent",
804            agent,
805            &format!("agent://{agent}"),
806        ])
807        .new_process_group()
808        .run();
809
810    // Release all claims
811    let _ = Tool::new("bus")
812        .args(&["claims", "release", "--agent", agent, "--all"])
813        .new_process_group()
814        .run();
815
816    // bn is event-sourced — no sync step needed
817
818    eprintln!("Cleanup complete for {agent}.");
819    Ok(())
820}
821
822/// Run the worker loop.
823pub fn run_worker_loop(
824    project_root: Option<PathBuf>,
825    agent: Option<String>,
826    model: Option<String>,
827) -> anyhow::Result<()> {
828    let worker = WorkerLoop::new(project_root, agent, model)?;
829
830    // Announce startup on bus (survives vessel log eviction)
831    let bone_info = worker
832        .dispatched_bone
833        .as_deref()
834        .unwrap_or("(triage)");
835    let ws_info = worker
836        .dispatched_workspace
837        .as_deref()
838        .unwrap_or("(none)");
839    let _ = Tool::new("bus")
840        .args(&[
841            "send",
842            "--agent",
843            &worker.agent,
844            &worker.project,
845            &format!("Worker started: {bone_info} in ws/{ws_info}"),
846            "-L",
847            "worker-lifecycle",
848        ])
849        .run();
850
851    let status = worker.run_once();
852
853    // Announce exit on bus regardless of outcome
854    let exit_msg = match &status {
855        Ok(LoopStatus::Complete) => format!("Worker exited OK: {bone_info} COMPLETE"),
856        Ok(LoopStatus::Blocked) => format!("Worker exited OK: {bone_info} BLOCKED"),
857        Ok(LoopStatus::Unknown) => format!("Worker exited: {bone_info} (no completion signal)"),
858        Err(e) => format!("Worker exited ERROR: {bone_info} — {e}"),
859    };
860    let _ = Tool::new("bus")
861        .args(&[
862            "send",
863            "--agent",
864            &worker.agent,
865            &worker.project,
866            &exit_msg,
867            "-L",
868            "worker-lifecycle",
869        ])
870        .run();
871
872    match status? {
873        LoopStatus::Complete => {
874            eprintln!("Worker loop completed successfully");
875            Ok(())
876        }
877        LoopStatus::Blocked => {
878            eprintln!("Worker loop blocked");
879            Ok(())
880        }
881        LoopStatus::Unknown => {
882            eprintln!("Warning: completion signal not found in output");
883            Ok(())
884        }
885    }
886}
887
#[cfg(test)]
mod tests {
    use super::*;

    /// Set the dispatch env vars to the given values before a prompt is built.
    ///
    /// NOTE(review): the test harness may run tests on several threads, in which
    /// case these writes race with each other — confirm whether `build_prompt`
    /// reads these vars at all.
    fn set_dispatch_env(bone: &str, workspace: &str) {
        unsafe {
            std::env::set_var("EDICT_BONE", bone);
            std::env::set_var("EDICT_WORKSPACE", workspace);
        }
    }

    /// Build the worker used by the prompt tests; only the review flag and the
    /// dispatched bone/workspace vary between them.
    fn worker_fixture(
        review_enabled: bool,
        bone: Option<&str>,
        workspace: Option<&str>,
    ) -> WorkerLoop {
        WorkerLoop {
            project_root: PathBuf::from("/test"),
            agent: "test-worker".to_string(),
            project: "testproject".to_string(),
            model_pool: vec!["haiku".to_string()],
            timeout: 900,
            review_enabled,
            critical_approvers: vec![],
            dispatched_bone: bone.map(String::from),
            dispatched_workspace: workspace.map(String::from),
            dispatched_mission: None,
            dispatched_siblings: None,
            dispatched_mission_outcome: None,
            dispatched_file_hints: None,
        }
    }

    /// Prompt for a worker with nothing dispatched (the triage path).
    fn triage_prompt(review_enabled: bool) -> String {
        set_dispatch_env("", "");
        worker_fixture(review_enabled, None, None).build_prompt()
    }

    #[test]
    fn parse_completion_signal_complete() {
        let output = "some text\n<promise>COMPLETE</promise>\nmore text";
        assert_eq!(parse_completion_signal(output), LoopStatus::Complete);
    }

    #[test]
    fn parse_completion_signal_blocked() {
        let output = "error occurred\n<promise>BLOCKED</promise>";
        assert_eq!(parse_completion_signal(output), LoopStatus::Blocked);
    }

    #[test]
    fn parse_completion_signal_missing() {
        let output = "no signal here";
        assert_eq!(parse_completion_signal(output), LoopStatus::Unknown);
    }

    #[test]
    fn build_prompt_contains_agent_identity() {
        let prompt = triage_prompt(true);

        for expected in ["test-worker", "testproject", "RESUME CHECK", "INBOX", "TRIAGE"] {
            assert!(prompt.contains(expected));
        }
    }

    #[test]
    fn build_prompt_dispatched_fast_path() {
        set_dispatch_env("bd-test", "test-ws");

        let prompt = worker_fixture(true, Some("bd-test"), Some("test-ws")).build_prompt();

        assert!(prompt.contains("DISPATCHED WORKER — FAST PATH"));
        assert!(prompt.contains("Pre-assigned bone: bd-test"));
        assert!(prompt.contains("Pre-assigned workspace: test-ws"));
        assert!(prompt.contains("Skip steps 0 (RESUME CHECK), 1 (INBOX), and 2 (TRIAGE)"));
    }

    #[test]
    fn build_prompt_contains_all_protocol_commands() {
        let prompt = triage_prompt(true);

        // All 5 protocol commands must be referenced in the worker prompt
        assert!(
            prompt.contains("edict protocol resume"),
            "worker prompt must reference 'edict protocol resume'"
        );
        assert!(
            prompt.contains("edict protocol start"),
            "worker prompt must reference 'edict protocol start'"
        );
        assert!(
            prompt.contains("edict protocol review"),
            "worker prompt must reference 'edict protocol review'"
        );
        assert!(
            prompt.contains("edict protocol finish"),
            "worker prompt must reference 'edict protocol finish'"
        );
        assert!(
            prompt.contains("edict protocol cleanup"),
            "worker prompt must reference 'edict protocol cleanup'"
        );
    }

    #[test]
    fn build_prompt_review_disabled() {
        let prompt = triage_prompt(false);

        assert!(prompt.contains("REVIEW is disabled"));
        assert!(prompt.contains("Skip code review"));
        assert!(prompt.contains("REVIEW=false"));
    }

    #[test]
    fn build_prompt_contains_protocol_fallback_wording() {
        let prompt = triage_prompt(true);

        // Verify fallback wording is present for protocol transitions
        // This prevents silent regressions where protocol fallback guidance is removed
        assert!(
            prompt.contains("If it fails (exit 1 = command unavailable), fall back"),
            "worker prompt must contain protocol fallback wording for unavailable commands"
        );

        // Verify transition hooks are explicitly documented
        assert!(
            prompt.contains("Try protocol command:"),
            "worker prompt must guide agents to try protocol commands first"
        );

        // Every protocol transition (resume, start, review, finish, cleanup) must be
        // mentioned; the second tuple element only labels the failure message.
        let fallback_patterns = [
            ("protocol resume", "resume check"),
            ("protocol start", "start"),
            ("protocol review", "review request"),
            ("protocol finish", "finish"),
            ("protocol cleanup", "cleanup"),
        ];

        for &(protocol_cmd, step_name) in &fallback_patterns {
            assert!(
                prompt.contains(protocol_cmd),
                "worker prompt must reference '{}' in {} step",
                protocol_cmd,
                step_name
            );
        }
    }

    #[test]
    fn rate_limit_detection_output() {
        assert!(is_rate_limit_output("Error 429: rate limit exceeded"));
        assert!(is_rate_limit_output("HTTP 429 - quota exceeded"));
        assert!(is_rate_limit_output("429 resource_exhausted"));
        assert!(is_rate_limit_output(
            "Got 429: You have exhausted your capacity"
        ));
        assert!(!is_rate_limit_output("Everything is fine"));
        assert!(!is_rate_limit_output("Error 500: server error"));
        // 429 alone without rate limit keywords should not match
        assert!(!is_rate_limit_output("429"));
    }

    #[test]
    fn rate_limit_detection_error() {
        assert!(is_rate_limit_error("429 Too Many Requests"));
        assert!(is_rate_limit_error("rate limit exceeded"));
        assert!(is_rate_limit_error("quota exhausted"));
        assert!(is_rate_limit_error("resource_exhausted"));
        assert!(!is_rate_limit_error("normal error"));
        assert!(!is_rate_limit_error("exit code 1"));
    }

    #[test]
    fn build_env_diagnostic_does_not_panic() {
        // The diagnostic function should handle all combinations of
        // set/unset vars without panicking. It writes to stderr only.
        let mut config_env = std::collections::HashMap::new();
        config_env.insert("CARGO_BUILD_JOBS".to_string(), "2".to_string());
        // RUSTC_WRAPPER not in config, SCCACHE_DIR not in config

        // Should not panic regardless of env state
        emit_build_env_diagnostic(&config_env);
    }

    #[test]
    fn build_env_diagnostic_with_empty_config() {
        let config_env = std::collections::HashMap::new();
        // All vars unset + empty config = should emit warnings without panic
        emit_build_env_diagnostic(&config_env);
    }
}
edict/commands/worker_loop.rs

edict/commands/
worker_loop.rs