Skip to main content

devboy_format_pipeline/
projection.rs

//! Paper 3 — argument projection for speculative pre-fetch.
//!
//! When the planner picks a `FollowUpLink` (e.g. `Glob → Read`), the
//! host needs concrete `args` to dispatch the prefetch. Two paths
//! produce them:
//!
//! 1. **Provider override** — `ToolEnricher::project_args` returns
//!    a vendor-specific projection (e.g. GitLab knows that
//!    `get_merge_requests → get_merge_request_discussions` projects
//!    `id` to `merge_request_id`).
//! 2. **Generic fallback** (this module) — when the user (or the
//!    built-in defaults in `tool_defaults`) annotates a follow-up
//!    with both `projection: Some("path")` and
//!    `projection_arg: Some("file_path")`, the generic resolver
//!    extracts every `path` from the previous response and emits one
//!    prefetch request per match, capped at
//!    `MAX_PROJECTIONS_PER_LINK`.
//!
//! Both paths return `Vec<Value>` — one JSON object per prefetch
//! request the host should dispatch. An empty `Vec` means "nothing to
//! prefetch from this link".
21
22use devboy_core::FollowUpLink;
23use serde_json::{Map, Value};
24
/// Hard cap on prefetches generated from a single follow-up link.
/// Mirrors the corpus finding that top-3 prefetch covers > 80% of
/// cited follow-ups (`paper3_corpus_findings.md` §Glob → Read,
/// §Grep → Read). Higher would just waste rate-limit budget.
///
/// Applied by the Glob, Grep and generic extractors in this module;
/// the WebSearch extractor uses its own, tighter limit of one URL.
pub const MAX_PROJECTIONS_PER_LINK: usize = 3;
30
31/// Built-in projection extractors for the canonical Paper-3 chains.
32///
33/// Returns a vector of `args` JSON objects, ready to feed into the
34/// follow-up tool's `tools/call`. The host should also enforce the
35/// per-turn `max_parallel_prefetches` cap on top of this.
36///
37/// Resolution order:
38///
39/// 1. Built-in match (`Glob → Read`, `Grep → Read`,
40///    `WebSearch → WebFetch`, …).
41/// 2. Generic fallback using `link.projection` + `link.projection_arg`
42///    on the JSON tree.
43/// 3. Empty `Vec` if neither path produced anything.
44pub fn extract_args(prev_tool: &str, prev_result: &Value, link: &FollowUpLink) -> Vec<Value> {
45    if let Some(args) = builtin_extract(prev_tool, prev_result, link) {
46        return args;
47    }
48    generic_extract(prev_result, link)
49}
50
51/// Hard-coded extractors for built-in tool chains. Returning `Some`
52/// short-circuits the generic path — useful when the schema is
53/// well-known and the generic JSON walk would miss it (e.g. text-only
54/// Grep output, plain newline-separated paths).
55fn builtin_extract(
56    prev_tool: &str,
57    prev_result: &Value,
58    link: &FollowUpLink,
59) -> Option<Vec<Value>> {
60    match (prev_tool, link.tool.as_str()) {
61        ("Glob", "Read") | ("Glob", "Grep") => Some(extract_glob_paths(
62            prev_result,
63            link.projection_arg.as_deref().unwrap_or("file_path"),
64        )),
65        ("Grep", "Read") | ("Grep", "Edit") => Some(extract_grep_paths(
66            prev_result,
67            link.projection_arg.as_deref().unwrap_or("file_path"),
68        )),
69        ("WebSearch", "WebFetch") => Some(extract_websearch_urls(
70            prev_result,
71            link.projection_arg.as_deref().unwrap_or("url"),
72        )),
73        _ => None,
74    }
75}
76
77/// Glob output is one of:
78///   - JSON array of strings: `["src/main.rs", "src/lib.rs"]`
79///   - JSON array of objects with a `path` / `match_path` field
80///   - newline-separated text body: `src/main.rs\nsrc/lib.rs\n`
81fn extract_glob_paths(prev_result: &Value, arg_name: &str) -> Vec<Value> {
82    let paths = if let Some(arr) = prev_result.as_array() {
83        arr.iter()
84            .filter_map(|v| {
85                v.as_str()
86                    .map(String::from)
87                    .or_else(|| string_field(v, "path"))
88                    .or_else(|| string_field(v, "match_path"))
89            })
90            .collect::<Vec<_>>()
91    } else if let Some(s) = prev_result.as_str() {
92        s.lines()
93            .map(str::trim)
94            .filter(|l| !l.is_empty())
95            .map(String::from)
96            .collect()
97    } else {
98        Vec::new()
99    };
100
101    paths
102        .into_iter()
103        .take(MAX_PROJECTIONS_PER_LINK)
104        .map(|p| single_arg(arg_name, Value::String(p)))
105        .collect()
106}
107
108/// Grep output line shape: `path:line:col:match` or `path:match`.
109/// We dedup by path (first hit wins) and take the top N — the agent
110/// is far more likely to read each unique file once than to read the
111/// same file three times.
112fn extract_grep_paths(prev_result: &Value, arg_name: &str) -> Vec<Value> {
113    let body = match prev_result {
114        Value::String(s) => s.clone(),
115        // Grep wrappers sometimes return JSON arrays of match objects
116        // — handle both shapes.
117        Value::Array(arr) => {
118            let mut seen: Vec<String> = Vec::new();
119            for v in arr {
120                if let Some(p) = string_field(v, "path").or_else(|| string_field(v, "file"))
121                    && !seen.contains(&p)
122                {
123                    seen.push(p);
124                }
125                if seen.len() >= MAX_PROJECTIONS_PER_LINK {
126                    break;
127                }
128            }
129            return seen
130                .into_iter()
131                .map(|p| single_arg(arg_name, Value::String(p)))
132                .collect();
133        }
134        _ => return Vec::new(),
135    };
136
137    let mut seen: Vec<String> = Vec::new();
138    for line in body.lines() {
139        let trimmed = line.trim();
140        if trimmed.is_empty() {
141            continue;
142        }
143        // First colon-delimited field is the path. Skip Windows-style
144        // "C:\foo" by looking past the second character before splitting.
145        let path = trimmed.split(':').next().unwrap_or("").trim().to_string();
146        if path.is_empty() || seen.contains(&path) {
147            continue;
148        }
149        seen.push(path);
150        if seen.len() >= MAX_PROJECTIONS_PER_LINK {
151            break;
152        }
153    }
154    seen.into_iter()
155        .map(|p| single_arg(arg_name, Value::String(p)))
156        .collect()
157}
158
159/// WebSearch returns a list of `{title, url, snippet}` objects (or a
160/// `{results: […]}` wrapper). Top-1 by default order is the most-
161/// likely fetch — corpus shows the agent rarely fetches deeper than
162/// position 1.
163fn extract_websearch_urls(prev_result: &Value, arg_name: &str) -> Vec<Value> {
164    let arr = prev_result
165        .get("results")
166        .and_then(Value::as_array)
167        .or_else(|| prev_result.as_array());
168    let Some(arr) = arr else {
169        return Vec::new();
170    };
171    arr.iter()
172        .filter_map(|v| string_field(v, "url"))
173        .take(1)
174        .map(|u| single_arg(arg_name, Value::String(u)))
175        .collect()
176}
177
178/// Generic fallback — walks the JSON tree of `prev_result` and
179/// extracts every leaf string at field name `link.projection`. Emits
180/// one prefetch request per leaf, with `link.projection_arg` as the
181/// argument name. Caps at `MAX_PROJECTIONS_PER_LINK`.
182fn generic_extract(prev_result: &Value, link: &FollowUpLink) -> Vec<Value> {
183    let Some(field) = link.projection.as_deref() else {
184        return Vec::new();
185    };
186    let Some(arg_name) = link.projection_arg.as_deref() else {
187        return Vec::new();
188    };
189
190    let mut out: Vec<Value> = Vec::new();
191    walk(prev_result, field, &mut |v| {
192        out.push(single_arg(arg_name, v.clone()));
193        out.len() < MAX_PROJECTIONS_PER_LINK
194    });
195    out
196}
197
198/// Walk `v` depth-first, calling `visit(field_value)` for every leaf
199/// where the parent object's key equals `field`. Visitor returns
200/// `true` to continue, `false` to stop.
201fn walk(v: &Value, field: &str, visit: &mut impl FnMut(&Value) -> bool) -> bool {
202    match v {
203        Value::Object(map) => {
204            for (k, val) in map {
205                if k == field {
206                    let cont = visit(val);
207                    if !cont {
208                        return false;
209                    }
210                }
211                if !walk(val, field, visit) {
212                    return false;
213                }
214            }
215            true
216        }
217        Value::Array(arr) => {
218            for item in arr {
219                if !walk(item, field, visit) {
220                    return false;
221                }
222            }
223            true
224        }
225        _ => true,
226    }
227}
228
229fn string_field(v: &Value, name: &str) -> Option<String> {
230    v.get(name).and_then(Value::as_str).map(String::from)
231}
232
/// Extract the host portion of a URL for rate-limit grouping.
/// Returns the lower-cased host without scheme, port, path, or query.
///
/// The parser is intentionally tiny — no `url` crate dependency and no
/// IDN normalisation. Edge cases like userinfo (`user:pass@`) and IPv6
/// brackets are handled, but exotic forms (URN, mailto:) return `None`.
///
/// `None` is also returned when:
/// - the text before `://` is not a syntactically valid scheme (a
///   letter followed by letters, digits, `+`, `-` or `.`), so a
///   relative reference that merely embeds a URL in its query is not
///   mistaken for an absolute one;
/// - the authority or the host is empty, including an empty bracketed
///   literal (`[]`).
///
/// ```
/// use devboy_format_pipeline::projection::extract_host;
/// assert_eq!(extract_host("https://api.github.com/repos/x/y"), Some("api.github.com".into()));
/// assert_eq!(extract_host("http://Example.COM:8080/foo"), Some("example.com".into()));
/// assert_eq!(extract_host("https://user:p@host.example.org/x"), Some("host.example.org".into()));
/// assert_eq!(extract_host("https://[::1]:80/p"), Some("[::1]".into()));
/// assert_eq!(extract_host("/local/path"), None);
/// assert_eq!(extract_host("/redirect?to=https://example.com/x"), None);
/// ```
pub fn extract_host(url: &str) -> Option<String> {
    let (scheme, after_scheme) = url.split_once("://")?;
    // Without this check `/redirect?to=https://x/y` would report `x`,
    // because the first `://` sits inside the query string.
    let mut scheme_chars = scheme.chars();
    let starts_with_letter = matches!(scheme_chars.next(), Some(c) if c.is_ascii_alphabetic());
    let rest_is_valid =
        scheme_chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
    if !starts_with_letter || !rest_is_valid {
        return None;
    }

    // Strip everything past the authority section.
    let authority = after_scheme.split(['/', '?', '#']).next()?;
    if authority.is_empty() {
        return None;
    }
    // Drop userinfo: `user:pass@host` → `host`.
    let host_with_port = match authority.rsplit_once('@') {
        Some((_, rest)) => rest,
        None => authority,
    };
    // IPv6: keep the bracketed form `[::1]`, strip only the trailing port.
    let host = if let Some(stripped) = host_with_port.strip_prefix('[') {
        let close = stripped.find(']')?;
        let inside = &stripped[..close];
        if inside.is_empty() {
            return None;
        }
        // Re-add brackets so the dispatcher's per-host map can use the
        // canonical `[::1]` form as a key.
        format!("[{inside}]")
    } else {
        // Strip the `:port` suffix on a non-bracketed authority.
        host_with_port
            .rsplit_once(':')
            .map(|(h, _)| h)
            .unwrap_or(host_with_port)
            .to_string()
    };
    if host.is_empty() {
        return None;
    }
    Some(host.to_ascii_lowercase())
}
280
281fn single_arg(name: &str, value: Value) -> Value {
282    let mut m = Map::new();
283    m.insert(name.to_string(), value);
284    Value::Object(m)
285}
286
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a fully annotated follow-up link with probability 1.0.
    fn link(tool: &str, projection: &str, arg: &str) -> FollowUpLink {
        FollowUpLink {
            probability: 1.0,
            tool: tool.into(),
            projection: Some(projection.into()),
            projection_arg: Some(arg.into()),
        }
    }

    /// Collects the string stored under `key` in every projected
    /// `args` object; panics when a value is missing or not a string.
    fn values_of<'a>(args: &'a [Value], key: &str) -> Vec<&'a str> {
        args.iter()
            .map(|request| request[key].as_str().expect("string argument"))
            .collect()
    }

    /// Checks `extract_host` against a table of `(url, expected)` pairs.
    fn assert_hosts(cases: &[(&str, Option<&str>)]) {
        for &(url, expected) in cases {
            assert_eq!(extract_host(url).as_deref(), expected, "url: {url}");
        }
    }

    // ─── extract_args: built-in chains ───────────────────────────────

    #[test]
    fn glob_to_read_extracts_paths_from_array_of_strings() {
        let listing = json!(["src/main.rs", "src/lib.rs", "src/api.rs", "src/db.rs"]);
        let args = extract_args("Glob", &listing, &link("Read", "match_path", "file_path"));
        // MAX_PROJECTIONS_PER_LINK = 3 — the fourth path is dropped.
        assert_eq!(
            values_of(&args, "file_path"),
            ["src/main.rs", "src/lib.rs", "src/api.rs"]
        );
    }

    #[test]
    fn glob_to_read_extracts_paths_from_array_of_objects() {
        let listing = json!([
            {"path": "a.rs", "size": 100},
            {"path": "b.rs", "size": 200},
        ]);
        let args = extract_args("Glob", &listing, &link("Read", "path", "file_path"));
        assert_eq!(values_of(&args, "file_path"), ["a.rs", "b.rs"]);
    }

    #[test]
    fn glob_to_read_extracts_paths_from_text_body() {
        let listing = json!("src/main.rs\n\nsrc/lib.rs\n  src/api.rs  \n");
        let args = extract_args("Glob", &listing, &link("Read", "match_path", "file_path"));
        assert_eq!(
            values_of(&args, "file_path"),
            ["src/main.rs", "src/lib.rs", "src/api.rs"]
        );
    }

    #[test]
    fn grep_to_read_dedups_by_path() {
        // Real grep output: the same file shows up on several lines and
        // the planner must not prefetch the same Read three times.
        let mut body = [
            "src/main.rs:10:fn foo() {}",
            "src/main.rs:42:fn bar() {}",
            "src/lib.rs:5:fn baz() {}",
            "src/db.rs:1:use std;",
        ]
        .join("\n");
        body.push('\n');

        let args = extract_args(
            "Grep",
            &Value::String(body),
            &link("Read", "path", "file_path"),
        );
        assert_eq!(
            values_of(&args, "file_path"),
            ["src/main.rs", "src/lib.rs", "src/db.rs"]
        );
    }

    #[test]
    fn grep_to_read_handles_array_of_objects() {
        let hits = json!([
            {"path": "a.rs", "line": 1},
            {"path": "a.rs", "line": 2},
            {"path": "b.rs", "line": 1},
        ]);
        let args = extract_args("Grep", &hits, &link("Read", "path", "file_path"));
        assert_eq!(values_of(&args, "file_path"), ["a.rs", "b.rs"]);
    }

    #[test]
    fn websearch_to_webfetch_takes_top_url_only() {
        let search = json!({
            "results": [
                {"title": "First",  "url": "https://example.com/a", "snippet": "…"},
                {"title": "Second", "url": "https://example.com/b", "snippet": "…"},
            ]
        });
        let args = extract_args("WebSearch", &search, &link("WebFetch", "url", "url"));
        // Top-1 only: corpus shows the agent rarely fetches deeper.
        assert_eq!(values_of(&args, "url"), ["https://example.com/a"]);
    }

    // ─── extract_args: generic fallback ──────────────────────────────

    #[test]
    fn generic_fallback_walks_nested_objects() {
        let response = json!({
            "outer": {
                "inner": [
                    {"id": 1, "deep": {"target_field": "value-1"}},
                    {"id": 2, "deep": {"target_field": "value-2"}},
                ]
            }
        });
        let follow_up = link("custom_get", "target_field", "identifier");
        let args = extract_args("custom_list", &response, &follow_up);
        assert_eq!(values_of(&args, "identifier"), ["value-1", "value-2"]);
    }

    #[test]
    fn generic_fallback_returns_empty_when_projection_missing() {
        // No projection on the link — the generic path has nothing to
        // look for.
        let follow_up = FollowUpLink {
            tool: "next".into(),
            probability: 1.0,
            ..FollowUpLink::default()
        };
        let args = extract_args("prev", &json!({"x": 1}), &follow_up);
        assert!(args.is_empty());
    }

    #[test]
    fn unknown_chain_falls_through_to_generic() {
        let response = json!({"items": [{"key": "k1"}, {"key": "k2"}]});
        let args = extract_args("produce", &response, &link("consume", "key", "name"));
        assert_eq!(values_of(&args, "name"), ["k1", "k2"]);
    }

    // ─── extract_host ────────────────────────────────────────────────

    #[test]
    fn extract_host_strips_scheme_and_path() {
        assert_hosts(&[
            ("https://api.github.com/repos/x/y", Some("api.github.com")),
            (
                "https://gitlab.example.com/project/-/issues",
                Some("gitlab.example.com"),
            ),
        ]);
    }

    #[test]
    fn extract_host_lowercases_and_drops_port() {
        assert_hosts(&[
            ("http://Example.COM:8080/foo", Some("example.com")),
            ("https://API.OPENAI.COM", Some("api.openai.com")),
        ]);
    }

    #[test]
    fn extract_host_handles_userinfo() {
        assert_hosts(&[
            ("https://user:pass@host.example.org/x", Some("host.example.org")),
            ("ftp://anonymous@ftp.example.org", Some("ftp.example.org")),
        ]);
    }

    #[test]
    fn extract_host_keeps_ipv6_brackets() {
        assert_hosts(&[
            ("https://[::1]:80/p", Some("[::1]")),
            ("http://[2001:db8::1]/foo", Some("[2001:db8::1]")),
        ]);
    }

    #[test]
    fn extract_host_returns_none_for_non_urls() {
        assert_hosts(&[
            ("/local/path", None),
            ("just-a-string", None),
            ("", None),
            ("https://", None),
        ]);
    }

    #[test]
    fn extract_host_strips_query_and_fragment() {
        assert_hosts(&[
            ("https://example.com/foo?bar=1&baz=2", Some("example.com")),
            ("https://example.com#anchor", Some("example.com")),
        ]);
    }
}