devboy_format_pipeline/
tool_defaults.rs

1//! Paper 3 — built-in default `ToolValueModel`s for selected common
2//! tools by corpus volume.
3//!
4//! Anchored on `docs/research/paper3_corpus_findings.md` (P-3-01).
5//! We ship defaults for a curated subset of the highest-volume tools
6//! (the canonical patterns from §Real-world patterns); the corpus has
7//! more tools above the 100-session threshold, but the long tail
8//! either reuses one of the shipped defaults via `[tools."*"]` or is
9//! overridden per-installation. Users can override any seeded
10//! annotation through `[tools.<name>]` in `pipeline_config.toml`
11//! (Paper 3 §Provider extensibility).
12//!
13//! The numbers below are *priors* — `tune analyze` will refine them
14//! against real telemetry, the same way Paper 2's adaptive tuner
15//! refines the encoder profiles.
16
17use std::collections::BTreeMap;
18
19use devboy_core::{CostModel, FollowUpLink, SideEffectClass, ToolValueModel, ValueClass};
20
21/// Returns the seeded `[tools.*]` map every layered pipeline starts
22/// with. `merge_right_wins` lets the user's TOML overrides win over
23/// these defaults.
24pub fn default_tool_value_models() -> BTreeMap<String, ToolValueModel> {
25    let mut m = BTreeMap::new();
26
27    // ─── Read — workhorse pattern (50 675 calls, 1 363 sessions) ─────
28    // Median 2.5 kB, p99 43 kB. Critical: file content is non-negotiable
29    // for code-edit work. Mutation hook (already wired in P-203-04)
30    // invalidates on Edit/Write/MultiEdit/NotebookEdit.
31    m.insert(
32        "Read".into(),
33        ToolValueModel {
34            value_class: ValueClass::Critical,
35            cost_model: CostModel {
36                typical_kb: 2.5,
37                max_kb: Some(43.0),
38                latency_ms_p50: Some(50),
39                ..CostModel::default()
40            },
41            follow_up: vec![FollowUpLink {
42                tool: "Read".into(),
43                probability: 0.45,
44                ..FollowUpLink::default()
45            }],
46            // Pure: same path → same bytes (mutation hook invalidates).
47            side_effect_class: SideEffectClass::Pure,
48            ..ToolValueModel::default()
49        },
50    );
51
52    // ─── Edit / Write / MultiEdit — mutating tools ───────────────────
53    // Their responses are tiny (median 162 / 137 / negligible bytes).
54    // Their value is in *invalidating* the read cache — handled by the
55    // mutation hook, but we declare `invalidates` so cross-tool
56    // invalidation in P-3-07 picks them up uniformly.
57    m.insert(
58        "Edit".into(),
59        ToolValueModel {
60            value_class: ValueClass::Supporting,
61            cost_model: CostModel {
62                typical_kb: 0.2,
63                max_kb: Some(1.0),
64                latency_ms_p50: Some(20),
65                ..CostModel::default()
66            },
67            follow_up: vec![
68                FollowUpLink {
69                    tool: "Bash".into(),
70                    probability: 0.27,
71                    ..FollowUpLink::default()
72                },
73                FollowUpLink {
74                    tool: "Read".into(),
75                    probability: 0.14,
76                    ..FollowUpLink::default()
77                },
78            ],
79            invalidates: vec!["Read".into(), "Grep".into()],
80            // MutatesLocal: never speculate — re-running an Edit would
81            // double-apply the patch.
82            side_effect_class: SideEffectClass::MutatesLocal,
83            ..ToolValueModel::default()
84        },
85    );
86    m.insert(
87        "Write".into(),
88        ToolValueModel {
89            value_class: ValueClass::Supporting,
90            cost_model: CostModel {
91                typical_kb: 0.2,
92                ..CostModel::default()
93            },
94            invalidates: vec!["Read".into(), "Grep".into(), "Glob".into()],
95            side_effect_class: SideEffectClass::MutatesLocal,
96            ..ToolValueModel::default()
97        },
98    );
99    m.insert(
100        "MultiEdit".into(),
101        ToolValueModel {
102            value_class: ValueClass::Supporting,
103            cost_model: CostModel {
104                typical_kb: 0.2,
105                ..CostModel::default()
106            },
107            invalidates: vec!["Read".into(), "Grep".into()],
108            side_effect_class: SideEffectClass::MutatesLocal,
109            ..ToolValueModel::default()
110        },
111    );
112    m.insert(
113        "NotebookEdit".into(),
114        ToolValueModel {
115            value_class: ValueClass::Supporting,
116            cost_model: CostModel {
117                typical_kb: 0.5,
118                ..CostModel::default()
119            },
120            invalidates: vec!["Read".into()],
121            side_effect_class: SideEffectClass::MutatesLocal,
122            ..ToolValueModel::default()
123        },
124    );
125
126    // ─── Bash — generic shell (110 930 calls — the most common tool) ─
127    // Critical for verification (cargo test, git, etc.) but median
128    // response is tiny (223 B). No follow-up annotation: corpus shows
129    // Bash → * is too varied to prefetch usefully.
130    m.insert(
131        "Bash".into(),
132        ToolValueModel {
133            value_class: ValueClass::Critical,
134            cost_model: CostModel {
135                typical_kb: 0.2,
136                max_kb: Some(9.0),
137                latency_ms_p50: Some(200),
138                ..CostModel::default()
139            },
140            // Indeterminate by design — `git status` is read-only,
141            // `rm -rf` is catastrophic. Sub-classification is its own
142            // research direction; until then, never speculate.
143            side_effect_class: SideEffectClass::Indeterminate,
144            ..ToolValueModel::default()
145        },
146    );
147
148    // ─── Grep — find-then-fix loop core (16 718 calls) ───────────────
149    // 1 120 (Grep → Edit) + 1 671 (Edit → Grep) edges. Strong prefetch
150    // signal: after Grep, prefetch top-3 file contents as Read.
151    m.insert(
152        "Grep".into(),
153        ToolValueModel {
154            value_class: ValueClass::Critical,
155            cost_model: CostModel {
156                typical_kb: 0.3,
157                max_kb: Some(10.5),
158                latency_ms_p50: Some(80),
159                ..CostModel::default()
160            },
161            follow_up: vec![
162                FollowUpLink {
163                    tool: "Read".into(),
164                    probability: 0.35,
165                    projection: Some("path".into()),
166                    projection_arg: Some("file_path".into()),
167                },
168                // Edit follow-up is informational only — never
169                // speculatively executed (MutatesLocal blocks it).
170                FollowUpLink {
171                    tool: "Edit".into(),
172                    probability: 0.07,
173                    projection: Some("path".into()),
174                    projection_arg: Some("file_path".into()),
175                },
176                FollowUpLink {
177                    tool: "Grep".into(),
178                    probability: 0.39,
179                    ..FollowUpLink::default()
180                },
181            ],
182            // Pure under file-mutation hook: same query → same matches
183            // until Edit/Write fires.
184            side_effect_class: SideEffectClass::Pure,
185            ..ToolValueModel::default()
186        },
187    );
188
189    // ─── Glob — bulk listing → inspect-each (6 202 calls) ────────────
190    // Glob → Read 2 007 edges, Glob → Grep 775. Speculative prefetch
191    // of top-N results when intent is "where is X used".
192    m.insert(
193        "Glob".into(),
194        ToolValueModel {
195            value_class: ValueClass::Supporting,
196            cost_model: CostModel {
197                typical_kb: 0.2,
198                max_kb: Some(16.6),
199                latency_ms_p50: Some(60),
200                ..CostModel::default()
201            },
202            follow_up: vec![
203                FollowUpLink {
204                    tool: "Read".into(),
205                    probability: 0.32,
206                    projection: Some("match_path".into()),
207                    projection_arg: Some("file_path".into()),
208                },
209                FollowUpLink {
210                    tool: "Grep".into(),
211                    probability: 0.13,
212                    // Grep needs a query string the planner cannot
213                    // synthesise from a path — not speculatable, kept
214                    // as informational hint only.
215                    ..FollowUpLink::default()
216                },
217                FollowUpLink {
218                    tool: "Glob".into(),
219                    probability: 0.41,
220                    ..FollowUpLink::default()
221                },
222            ],
223            side_effect_class: SideEffectClass::ReadOnly,
224            ..ToolValueModel::default()
225        },
226    );
227
228    // ─── WebSearch / WebFetch — search → resolve chain (1 081 edges) ─
229    // 6 fields surface in our corpus; only `title`/`url` are reliably
230    // cited downstream — drop snippets first under tight budget.
231    m.insert(
232        "WebSearch".into(),
233        ToolValueModel {
234            value_class: ValueClass::Supporting,
235            cost_model: CostModel {
236                typical_kb: 3.1,
237                max_kb: Some(7.2),
238                latency_ms_p50: Some(900),
239                freshness_ttl_s: Some(3600),
240                ..CostModel::default()
241            },
242            // Read-only with TTL — same query → near-identical results
243            // for ~1 hour (freshness_ttl_s).
244            side_effect_class: SideEffectClass::ReadOnly,
245            rate_limit_host: None,
246            follow_up: vec![FollowUpLink {
247                tool: "WebFetch".into(),
248                probability: 0.65,
249                projection: Some("url".into()),
250                projection_arg: Some("url".into()),
251            }],
252            field_groups: {
253                let mut g = BTreeMap::new();
254                g.insert(
255                    "must_have".into(),
256                    devboy_core::FieldGroup {
257                        fields: vec!["title".into(), "url".into()],
258                        estimated_value: 1.0,
259                        default_include: true,
260                    },
261                );
262                g.insert(
263                    "nice_to_have".into(),
264                    devboy_core::FieldGroup {
265                        fields: vec!["snippet".into()],
266                        estimated_value: 0.3,
267                        default_include: false,
268                    },
269                );
270                g
271            },
272            ..ToolValueModel::default()
273        },
274    );
275    m.insert(
276        "WebFetch".into(),
277        ToolValueModel {
278            value_class: ValueClass::Supporting,
279            cost_model: CostModel {
280                typical_kb: 1.2,
281                max_kb: Some(24.0),
282                latency_ms_p50: Some(800),
283                freshness_ttl_s: Some(900),
284                ..CostModel::default()
285            },
286            side_effect_class: SideEffectClass::ReadOnly,
287            rate_limit_host: None,
288            ..ToolValueModel::default()
289        },
290    );
291
292    // ─── Task management noise (audit_only) ──────────────────────────
293    // `TaskUpdate` median 23 B, `TodoWrite` 160 B, `TaskCreate` 78 B.
294    // Excluded from budget accounting entirely (Paper 3 §6).
295    for name in [
296        "TaskUpdate",
297        "TaskCreate",
298        "TaskGet",
299        "TaskList",
300        "TodoWrite",
301    ] {
302        m.insert(name.into(), ToolValueModel::audit_only());
303    }
304
305    // ─── ToolSearch — fail-fast loop ─────────────────────────────────
306    // 50%+ of repeated calls return zero bytes; planner gives up after
307    // two empty calls and emits a "tool not found" note instead.
308    m.insert(
309        "ToolSearch".into(),
310        ToolValueModel {
311            value_class: ValueClass::Supporting,
312            cost_model: CostModel {
313                typical_kb: 0.0,
314                max_kb: Some(0.1),
315                ..CostModel::default()
316            },
317            fail_fast_after_n: Some(2),
318            // Read-only metadata lookup — but typical_kb is 0, so the
319            // planner's cost-clamp puts it at 1 token; speculation buys
320            // little and the fail-fast circuit is the real win here.
321            side_effect_class: SideEffectClass::ReadOnly,
322            ..ToolValueModel::default()
323        },
324    );
325
326    // ─── Agent / Task subagent — long, expensive, value-rich ─────────
327    // Median 6.6 kB, p99 23.7 kB. `Supporting` because the LLM can
328    // proceed without it but accuracy drops.
329    m.insert(
330        "Agent".into(),
331        ToolValueModel {
332            value_class: ValueClass::Supporting,
333            cost_model: CostModel {
334                typical_kb: 6.5,
335                max_kb: Some(23.7),
336                latency_ms_p50: Some(60_000),
337                ..CostModel::default()
338            },
339            // Sub-agent runs arbitrary tools — assume Indeterminate
340            // until we know its inner side-effect profile.
341            side_effect_class: SideEffectClass::Indeterminate,
342            ..ToolValueModel::default()
343        },
344    );
345
346    m
347}
348
#[cfg(test)]
mod tests {
    use super::*;

    /// Tools seeded as file-mutating; shared by the safety and the
    /// cache-invalidation tests so neither can silently skip one.
    const MUTATING_TOOLS: [&str; 4] = ["Edit", "Write", "MultiEdit", "NotebookEdit"];

    /// Tools seeded through `ToolValueModel::audit_only()`.
    const AUDIT_ONLY_TOOLS: [&str; 5] = [
        "TaskUpdate",
        "TaskCreate",
        "TaskGet",
        "TaskList",
        "TodoWrite",
    ];

    /// Looks up a seeded model, panicking with the tool name when the
    /// default is absent so the failing test names the culprit.
    fn seeded<'a>(
        models: &'a BTreeMap<String, ToolValueModel>,
        tool: &str,
    ) -> &'a ToolValueModel {
        models
            .get(tool)
            .unwrap_or_else(|| panic!("{tool} missing"))
    }

    /// Every tool the defaults table seeds must be present in the map.
    #[test]
    fn defaults_cover_top_tools_from_corpus() {
        let models = default_tool_value_models();
        let expected = [
            "Read",
            "Bash",
            "Grep",
            "Glob",
            "WebSearch",
            "WebFetch",
            "ToolSearch",
            "Agent",
        ];
        let all_expected = expected
            .iter()
            .chain(MUTATING_TOOLS.iter())
            .chain(AUDIT_ONLY_TOOLS.iter());
        for &required in all_expected {
            assert!(
                models.contains_key(required),
                "missing default for {required}"
            );
        }
    }

    /// All task-management tools are audit-only and therefore excluded
    /// from budget accounting.
    #[test]
    fn audit_only_tools_are_excluded_from_budget() {
        let models = default_tool_value_models();
        for name in AUDIT_ONLY_TOOLS {
            assert!(
                seeded(&models, name).excluded_from_budget(),
                "{name} should be excluded_from_budget"
            );
        }
    }

    /// `Read` stays `Critical` with its `typical_kb` pinned to 2.5.
    #[test]
    fn read_is_critical_with_typical_kb_anchored_on_corpus() {
        let models = default_tool_value_models();
        let read = seeded(&models, "Read");
        assert_eq!(read.value_class, ValueClass::Critical);
        assert_eq!(read.cost_model.typical_kb, 2.5);
    }

    /// Both the Read and the Edit follow-ups of `Grep` project the
    /// `path` field.
    #[test]
    fn grep_followup_includes_read_and_edit_with_path_projection() {
        let models = default_tool_value_models();
        let links = &seeded(&models, "Grep").follow_up;
        for target in ["Read", "Edit"] {
            let link = links
                .iter()
                .find(|link| link.tool == target)
                .unwrap_or_else(|| panic!("Grep has no follow-up link to {target}"));
            assert_eq!(link.projection.as_deref(), Some("path"));
        }
    }

    /// `must_have` fields are included by default; `nice_to_have`
    /// (snippets) are not.
    #[test]
    fn web_search_drops_snippets_first_under_budget() {
        let models = default_tool_value_models();
        let groups = &seeded(&models, "WebSearch").field_groups;
        assert!(groups["must_have"].default_include);
        assert!(!groups["nice_to_have"].default_include);
    }

    /// **Safety invariant**: `Bash` and `Agent` are
    /// `SideEffectClass::Indeterminate` by design — `git status`
    /// behaves as a read but `rm -rf` is catastrophic; sub-agents
    /// run arbitrary tools. Speculation must **never** dispatch
    /// either. Edit / Write / MultiEdit / NotebookEdit must stay
    /// `MutatesLocal`. Anything that loosens this invariant should
    /// trip the test.
    #[test]
    fn never_speculatable_safety_invariant() {
        let models = default_tool_value_models();
        let indeterminate = ["Bash", "Agent"];
        for &tool in indeterminate.iter().chain(MUTATING_TOOLS.iter()) {
            let model = seeded(&models, tool);
            assert!(
                !model.is_speculatable(),
                "SAFETY: {tool} (side_effect={:?}) must never be speculatable; \
                 a regression here can lead to double-applied writes / shell \
                 commands re-run",
                model.side_effect_class
            );
        }
    }

    /// Counterpart to the safety invariant — every Pure / ReadOnly
    /// tool stays speculatable. Catches accidental flips of
    /// `side_effect_class` to `Indeterminate` on the wrong tool.
    #[test]
    fn pure_and_read_only_tools_are_speculatable() {
        let models = default_tool_value_models();
        for tool in [
            "Read",
            "Grep",
            "Glob",
            "WebSearch",
            "WebFetch",
            "ToolSearch",
        ] {
            let model = seeded(&models, tool);
            assert!(
                model.is_speculatable(),
                "{tool} (side_effect={:?}) should remain speculatable",
                model.side_effect_class
            );
        }
    }

    /// `ToolSearch` gives up after two calls.
    #[test]
    fn tool_search_has_fail_fast() {
        let models = default_tool_value_models();
        assert_eq!(seeded(&models, "ToolSearch").fail_fast_after_n, Some(2));
    }

    /// Every file-mutating tool, `NotebookEdit` included, declares that
    /// it invalidates cached `Read` results.
    #[test]
    fn mutating_tools_invalidate_read_cache() {
        let models = default_tool_value_models();
        for name in MUTATING_TOOLS {
            assert!(
                seeded(&models, name)
                    .invalidates
                    .iter()
                    .any(|target| target == "Read"),
                "{name} should invalidate Read"
            );
        }
    }
}
devboy_format_pipeline/tool_defaults.rs

devboy_format_pipeline/
tool_defaults.rs