crw_mcp_proto/
lib.rs

1//! Shared MCP (Model Context Protocol) JSON-RPC types and tool definitions.
2//!
3//! Used by both the HTTP MCP endpoint (`crw-server`) and the stdio MCP proxy (`crw-mcp`).
4
5use serde::{Deserialize, Serialize};
6use serde_json::{Value, json};
7
8/// MCP spec revision advertised in the `initialize` handshake (lib.rs `initialize`
9/// arm). Bumped from "2024-11-05" to "2025-06-18" to legitimize tool `outputSchema`
10/// and result `structuredContent`, both introduced in the 2025-06-18 revision.
11/// There is no per-feature capability flag for structured output, so advertising
12/// the revision that defines it is the only spec-legal way to emit it.
13///
14/// NOTE: `crw-browse` is a separate rmcp-based MCP server that pins its own
15/// `ProtocolVersion::V_2024_11_05` (crw-browse/src/server.rs) and does NOT consume
16/// this constant — it intentionally stays on 2024-11-05.
17pub const PROTOCOL_VERSION: &str = "2025-06-18";
18
19// --- JSON-RPC types ---
20
/// An incoming JSON-RPC message as deserialized from the client's JSON.
#[derive(Deserialize)]
pub struct JsonRpcRequest {
    /// Protocol version tag; `handle_protocol_method` rejects anything other than `"2.0"`.
    pub jsonrpc: String,
    /// Request id, if the message carried one. Handlers in this file answer a
    /// missing id with JSON `null`.
    pub id: Option<Value>,
    /// Method name, e.g. `"initialize"`, `"tools/list"` or `"ping"`.
    pub method: String,
    /// Method parameters; takes `Value`'s default when the field is omitted.
    #[serde(default)]
    pub params: Value,
}
29
/// An outgoing JSON-RPC response carrying either a `result` or an `error`.
///
/// Build one with [`JsonRpcResponse::success`] or [`JsonRpcResponse::error`];
/// each constructor populates exactly one of the two optional fields.
#[derive(Serialize)]
pub struct JsonRpcResponse {
    /// Protocol version tag; both constructors set it to `"2.0"`.
    pub jsonrpc: String,
    /// Id of the request being answered.
    pub id: Value,
    /// Success payload; omitted from the serialized JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub result: Option<Value>,
    /// Error payload; omitted from the serialized JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<JsonRpcError>,
}
39
/// The `error` member of a [`JsonRpcResponse`].
#[derive(Serialize)]
pub struct JsonRpcError {
    /// Numeric error code (e.g. `-32600`, used in this file for an invalid
    /// `jsonrpc` version).
    pub code: i64,
    /// Human-readable description of the failure.
    pub message: String,
}
45
46impl JsonRpcResponse {
47    pub fn success(id: Value, result: Value) -> Self {
48        Self {
49            jsonrpc: "2.0".into(),
50            id,
51            result: Some(result),
52            error: None,
53        }
54    }
55
56    pub fn error(id: Value, code: i64, message: String) -> Self {
57        Self {
58            jsonrpc: "2.0".into(),
59            id,
60            result: None,
61            error: Some(JsonRpcError { code, message }),
62        }
63    }
64}
65
66// --- Tool definitions ---
67
/// Builds the `{"tools": [...]}` payload describing every tool this server
/// knows: `crw_scrape`, `crw_crawl`, `crw_check_crawl_status`, `crw_map`,
/// `crw_search` and `crw_parse_file`, in that order.
///
/// Each entry carries a `name`, `title`, `description`, behavioural
/// `annotations` and an `inputSchema`; `crw_search` additionally declares an
/// `outputSchema`.
///
/// `proxy_mode` is currently unused: the same definitions are returned in both
/// modes.
pub fn tool_definitions(proxy_mode: bool) -> Value {
    let mut tools = vec![
        json!({
            "name": "crw_scrape",
            "title": "Scrape URL",
            "description": "Scrape one URL to markdown, HTML, or links.",
            "annotations": {
                "readOnlyHint": true,
                "destructiveHint": false,
                "idempotentHint": true,
                "openWorldHint": true
            },
            "inputSchema": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "URL to scrape"
                    },
                    "formats": {
                        "type": "array",
                        "items": { "type": "string", "enum": ["markdown", "html", "links"] },
                        "description": "Output formats (default [\"markdown\"])"
                    },
                    "onlyMainContent": {
                        "type": "boolean",
                        "description": "Strip nav/footer; main content only (default true)"
                    },
                    "includeTags": {
                        "type": "array",
                        "items": { "type": "string" },
                        "description": "CSS selectors to include"
                    },
                    "excludeTags": {
                        "type": "array",
                        "items": { "type": "string" },
                        "description": "CSS selectors to exclude"
                    },
                    "renderJs": {
                        "type": "boolean",
                        "description": "Force JS render (true), HTTP-only (false), omit = auto"
                    },
                    "waitFor": {
                        "type": "integer",
                        "description": "Ms to wait after JS render for late content"
                    },
                    "maxLength": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Max chars per content field; 0 = unbounded (default ~15000)"
                    },
                    "renderer": {
                        "type": "string",
                        "enum": ["auto", "lightpanda", "chrome", "playwright"],
                        "description": "Pin renderer; non-auto hard-pins and implies renderJs:true (default auto)"
                    }
                },
                "required": ["url"]
            }
        }),
        json!({
            "name": "crw_crawl",
            "title": "Crawl site",
            "description": "Start an async site crawl; returns a job id to poll with crw_check_crawl_status.",
            // Starting a crawl creates server-side job state (a side effect), so
            // this is NOT read-only and NOT idempotent.
            "annotations": {
                "readOnlyHint": false,
                "destructiveHint": false,
                "idempotentHint": false,
                "openWorldHint": true
            },
            "inputSchema": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "Starting URL"
                    },
                    "maxDepth": {
                        "type": "integer",
                        "description": "Max crawl depth (default 2)"
                    },
                    "maxPages": {
                        "type": "integer",
                        "description": "Max pages to crawl (default 10)"
                    },
                    "jsonSchema": {
                        "type": "object",
                        "description": "JSON schema for LLM extraction per page"
                    },
                    "renderJs": {
                        "type": "boolean",
                        "description": "Force JS render (true), HTTP-only (false), omit = auto"
                    },
                    "waitFor": {
                        "type": "integer",
                        "description": "Ms to wait after JS render per page"
                    },
                    "renderer": {
                        "type": "string",
                        "enum": ["auto", "lightpanda", "chrome", "playwright"],
                        "description": "Pin renderer; non-auto hard-pins and implies renderJs:true (default auto)"
                    }
                },
                "required": ["url"]
            }
        }),
        json!({
            "name": "crw_check_crawl_status",
            "title": "Check crawl status",
            "description": "Poll an async crawl job and retrieve its pages.",
            "annotations": {
                "readOnlyHint": true,
                "destructiveHint": false,
                "idempotentHint": true,
                "openWorldHint": true
            },
            "inputSchema": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string",
                        "description": "Crawl job id from crw_crawl"
                    },
                    "maxLength": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Max chars per page content field; 0 = unbounded (default ~15000)"
                    }
                },
                "required": ["id"]
            }
        }),
        json!({
            "name": "crw_map",
            "title": "Map site URLs",
            "description": "Discover URLs on a site via sitemap and/or a short crawl. Returns a URL list only, no page content.",
            "annotations": {
                "readOnlyHint": true,
                "destructiveHint": false,
                "idempotentHint": true,
                "openWorldHint": true
            },
            "inputSchema": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "URL to map"
                    },
                    "maxDepth": {
                        "type": "integer",
                        "description": "Max discovery depth (default 2)"
                    },
                    "useSitemap": {
                        "type": "boolean",
                        "description": "Use sitemap.xml (default true)"
                    },
                    "crawlFallback": {
                        "type": "boolean",
                        "description": "Supplement sitemap with a short BFS crawl (default true; false = sitemap-only)"
                    },
                    "limit": {
                        "type": "integer",
                        "minimum": 0,
                        "description": "Max URLs returned; 0 = unbounded (default 100)"
                    }
                },
                "required": ["url"]
            }
        }),
    ];

    // `crw_search` is always part of the definitions returned here; the
    // `tools/list` arm of `handle_protocol_method` may filter it out when the
    // caller reports that no search backend is available. In embedded mode it
    // dispatches to a local SearXNG sidecar via crw-server's `/v1/search`
    // pipeline; in proxy mode it forwards to the configured remote API. Whether
    // the underlying SearXNG instance is configured is a runtime concern — the
    // server returns a clear `search_disabled` error when [search].searxng_url
    // is unset.
    //
    // Explicitly discard `proxy_mode`: the parameter is kept in the signature
    // but no longer influences the result.
    let _ = proxy_mode;
    tools.push(json!({
        "name": "crw_search",
        "title": "Web search",
        "description": "Search the web (needs a configured search backend; embedded uses a local SearXNG sidecar). Returns results with url/title/description/snippet.",
        "annotations": {
            "readOnlyHint": true,
            "destructiveHint": false,
            "idempotentHint": true,
            "openWorldHint": true
        },
        "inputSchema": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query"
                },
                "limit": {
                    "type": "integer",
                    "description": "Max results (default 5, max 20)"
                },
                "lang": {
                    "type": "string",
                    "description": "Language code, e.g. \"en\", \"tr\""
                },
                "country": {
                    "type": "string",
                    "description": "Country code hint, e.g. \"us\", \"tr\""
                },
                "tbs": {
                    "type": "string",
                    "enum": ["qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y"],
                    "description": "Time filter: past hour/day/week/month/year"
                },
                "sources": {
                    "type": "array",
                    "items": { "type": "string", "enum": ["web", "news", "images"] },
                    "description": "If set, group results by source instead of a flat list"
                },
                "categories": {
                    "type": "array",
                    "items": { "type": "string" },
                    "description": "Category bias; e.g. \"pdf\", \"github\", \"research\", or a native SearXNG category"
                },
                "scrapeOptions": {
                    "type": "object",
                    "description": "If set, scrape each web result and inline the requested formats",
                    "properties": {
                        "formats": {
                            "type": "array",
                            "items": { "type": "string", "enum": ["markdown", "html", "rawHtml", "links"] }
                        },
                        "onlyMainContent": {
                            "type": "boolean",
                            "description": "Strip nav/footer/ads (default true)"
                        }
                    }
                }
            },
            "required": ["query"]
        },
        // Intentionally minimal: declares the stable top-level contract
        // (`{success, data:{results}}`) that strict clients validate, while leaving
        // `results` permissive — it is a `#[serde(untagged)]` enum that serializes
        // either as a flat array OR a grouped `{web,news,images}` object, and items
        // carry conditional fields (markdown/html/links/imageUrl/…). A rich schema
        // here costs ~400 tok in every `tools/list` for little client benefit and
        // risks falsely rejecting real responses, so we keep it skeletal. No
        // `additionalProperties:false` anywhere (conditional fields).
        "outputSchema": {
            "type": "object",
            "properties": {
                "success": { "type": "boolean" },
                "data": {
                    "type": "object",
                    "properties": {
                        "results": {
                            "oneOf": [
                                { "type": "array", "items": { "type": "object" } },
                                { "type": "object" }
                            ]
                        }
                    },
                    "required": ["results"]
                }
            },
            "required": ["success", "data"]
        }
    }));

    tools.push(json!({
        "name": "crw_parse_file",
        "title": "Parse PDF",
        "description": "Parse a local PDF (base64 in contentBase64) to markdown. No OCR: scanned PDFs return empty markdown with a warning.",
        // openWorldHint:false — operates on provided bytes, not the open web.
        "annotations": {
            "readOnlyHint": true,
            "destructiveHint": false,
            "idempotentHint": true,
            "openWorldHint": false
        },
        "inputSchema": {
            "type": "object",
            "properties": {
                "contentBase64": {
                    "type": "string",
                    "description": "Base64-encoded PDF bytes"
                },
                "filename": {
                    "type": "string",
                    "description": "Original filename (optional)"
                },
                "formats": {
                    "type": "array",
                    "items": { "type": "string", "enum": ["markdown", "plainText", "links", "json", "summary"] },
                    "description": "Output formats (default [\"markdown\"]); json/summary need a server LLM"
                },
                "jsonSchema": {
                    "type": "object",
                    "description": "JSON schema for LLM extraction (when formats has json)"
                },
                "parsers": {
                    "type": "array",
                    "items": { "type": "string", "enum": ["pdf"] },
                    "description": "Parsers to apply (default [\"pdf\"])"
                },
                "maxLength": {
                    "type": "integer",
                    "minimum": 0,
                    "description": "Max chars per content field; 0 = unbounded (default ~15000)"
                }
            },
            "required": ["contentBase64"]
        }
    }));

    // Wrap the list in the object shape that `tools/list` returns as its result.
    json!({ "tools": tools })
}
386
387/// Returns the declared `outputSchema` for a tool, if it declares one.
388///
389/// Single source of truth: `structuredContent` emission is derived from the
390/// same `tool_definitions` declaration that `tools/list` advertises, so the two
391/// can never drift. Recomputes `tool_definitions` per call — `tools/call` is not
392/// hot; memoize behind a `OnceLock` only if profiling ever demands it.
393pub fn tool_output_schema(tool_name: &str) -> Option<Value> {
394    tool_definitions(false)["tools"]
395        .as_array()?
396        .iter()
397        .find(|t| t["name"] == tool_name)
398        .and_then(|t| t.get("outputSchema").cloned())
399}
400
401/// Whether `name` is one of the server's tool names. A genuinely unknown tool
402/// should be answered with a JSON-RPC `-32602` protocol error (clients degrade
403/// more gracefully than on an `isError` execution result). Checks the full set
404/// regardless of runtime availability (e.g. `crw_search` is a known name even when
405/// no search backend is configured — calling it then yields a clear runtime error).
406pub fn is_known_tool(name: &str) -> bool {
407    tool_definitions(false)["tools"]
408        .as_array()
409        .is_some_and(|tools| tools.iter().any(|t| t["name"] == name))
410}
411
/// Result of handling a protocol method, as returned by `handle_protocol_method`.
pub enum ProtocolResult {
    /// Send this response back to the client. Produced for `initialize`,
    /// `tools/list` and `ping`, and for requests rejected because of an invalid
    /// `jsonrpc` version.
    Response(JsonRpcResponse),
    /// Notification — no response needed. Produced for
    /// `notifications/initialized` and `notifications/cancelled`.
    Notification,
    /// Not a protocol method — caller should handle it (every method name not
    /// listed above).
    NotHandled,
}
421
422/// Handle common MCP protocol methods (initialize, tools/list, ping, notifications).
423///
424/// `search_available` controls whether `crw_search` is advertised in `tools/list`.
425/// Proxy callers pass `true` (the remote decides); embedded callers pass whether a
426/// search backend (SearXNG) is actually configured, so users who run `npx … crw`
427/// with no backend don't see a tool that only ever returns `search_disabled`.
428pub fn handle_protocol_method(
429    server_name: &str,
430    server_version: &str,
431    req: &JsonRpcRequest,
432    proxy_mode: bool,
433    search_available: bool,
434) -> ProtocolResult {
435    if req.jsonrpc != "2.0" {
436        let id = req.id.clone().unwrap_or(Value::Null);
437        return ProtocolResult::Response(JsonRpcResponse::error(
438            id,
439            -32600,
440            "invalid jsonrpc version".into(),
441        ));
442    }
443
444    match req.method.as_str() {
445        "notifications/initialized" | "notifications/cancelled" => ProtocolResult::Notification,
446
447        "initialize" => {
448            let id = req.id.clone().unwrap_or(Value::Null);
449            ProtocolResult::Response(JsonRpcResponse::success(
450                id,
451                json!({
452                    "protocolVersion": PROTOCOL_VERSION,
453                    // The tool set is fixed for the lifetime of a session (it depends
454                    // only on startup config), so we never emit tools/list_changed.
455                    "capabilities": { "tools": { "listChanged": false } },
456                    "serverInfo": {
457                        "name": server_name,
458                        "version": server_version
459                    }
460                }),
461            ))
462        }
463
464        "tools/list" => {
465            let id = req.id.clone().unwrap_or(Value::Null);
466            let mut defs = tool_definitions(proxy_mode);
467            if !search_available
468                && let Some(tools) = defs.get_mut("tools").and_then(Value::as_array_mut)
469            {
470                tools.retain(|t| t["name"] != "crw_search");
471            }
472            ProtocolResult::Response(JsonRpcResponse::success(id, defs))
473        }
474
475        "ping" => {
476            let id = req.id.clone().unwrap_or(Value::Null);
477            ProtocolResult::Response(JsonRpcResponse::success(id, json!({})))
478        }
479
480        _ => ProtocolResult::NotHandled,
481    }
482}
483
484/// Wrap a tool call result into an MCP-compliant content response.
485///
486/// On success the structured `value` is emitted **both** as a text content block
487/// (verbatim, for backward compatibility with lenient clients and clients that
488/// negotiated an older protocol revision) **and**, when the called tool declares
489/// an `outputSchema`, as a top-level `structuredContent` field (MCP 2025-06-18)
490/// so strict clients can validate it. Both representations derive from the same
491/// `value` binding, so `serde_json::from_str(content[0].text) == structuredContent`
492/// holds by construction — the two can never disagree.
493pub fn tool_result_response(
494    id: Value,
495    tool_name: &str,
496    result: Result<Value, String>,
497) -> JsonRpcResponse {
498    match result {
499        Ok(value) => {
500            // Compact (not pretty) — pretty-printing adds ~30% whitespace, and this
501            // text block is injected verbatim into the agent's context.
502            let text = serde_json::to_string(&value).unwrap_or_default();
503            let mut payload = json!({
504                "content": [{"type": "text", "text": text}]
505            });
506            // Attach structuredContent only when (a) the tool declares an
507            // outputSchema and (b) the value is a JSON object — the spec requires
508            // structuredContent to be an object. The `is_object()` guard is the
509            // proxy version-skew safety valve: in proxy mode a schema-bearing tool
510            // may yield a non-object Ok value (an upstream error string, a plain
511            // string, or a legacy top-level array) — degrade to text-only rather
512            // than ship a spec-violating structuredContent to a strict client.
513            // Locked by test T2b. Do NOT remove the is_object() guard.
514            if value.is_object() && tool_output_schema(tool_name).is_some() {
515                payload["structuredContent"] = value;
516            }
517            JsonRpcResponse::success(id, payload)
518        }
519        // Err path: never attach structuredContent. `isError:true` signals
520        // failure, and strict clients must not validate outputSchema against an
521        // error result.
522        Err(e) => JsonRpcResponse::success(
523            id,
524            json!({
525                "content": [{"type": "text", "text": e}],
526                "isError": true
527            }),
528        ),
529    }
530}
531
532// --- Output bounding (MCP-layer, context-footprint control) ---
533
534/// Default per-content-field char cap for scrape/parse/crawl-status results.
535/// ~15K chars ≈ ~3.5–4K tokens — well under the typical ~25K-token client cap.
536pub const DEFAULT_MAX_LENGTH: usize = 15_000;
537/// Default cap on the number of URLs `crw_map` returns to the model.
538pub const DEFAULT_MAP_LIMIT: usize = 100;
539
540/// Large string fields on a serialized `ScrapeData` (camelCase) worth truncating.
541const SCRAPE_TEXT_FIELDS: &[&str] = &["markdown", "html", "rawHtml", "plainText", "summary"];
542
543/// Resolve an MCP-only bound argument. Returns:
544/// - `Some(default)` when the arg is absent,
545/// - `None` (= unbounded) when the arg is explicitly `0`,
546/// - `Some(n)` for a positive value.
547fn resolve_bound(args: &Value, key: &str, default: usize) -> Option<usize> {
548    match args.get(key).and_then(Value::as_u64) {
549        None => Some(default),
550        Some(0) => None,
551        Some(n) => Some(n as usize),
552    }
553}
554
/// Cuts `s` down to its first `max_chars` characters (never splitting a
/// multi-byte character) and appends a visible truncation marker. Yields `None`
/// when `s` already fits and nothing had to be cut.
fn truncate_to_chars(s: &str, max_chars: usize) -> Option<String> {
    // The character at index `max_chars` is the first one to drop, so its byte
    // offset is the cut point. If there is no such character the string is
    // short enough and `?` returns `None`.
    let (cut_at, _) = s.char_indices().nth(max_chars)?;
    let kept = &s[..cut_at];
    Some(format!("{kept}\n…[truncated by crw-mcp maxLength]"))
}
564
565/// Truncate the known large text fields of one serialized `ScrapeData` object,
566/// tagging it with `truncated: true` if anything was cut. Non-recursive.
567fn truncate_scrape_obj(value: &mut Value, max: usize) {
568    let Some(obj) = value.as_object_mut() else {
569        return;
570    };
571    let mut any = false;
572    for field in SCRAPE_TEXT_FIELDS {
573        let cut = match obj.get(*field) {
574            Some(Value::String(s)) => truncate_to_chars(s, max),
575            _ => None,
576        };
577        if let Some(t) = cut {
578            obj.insert((*field).to_string(), Value::String(t));
579            any = true;
580        }
581    }
582    if any {
583        obj.insert("truncated".to_string(), Value::Bool(true));
584    }
585}
586
587/// The single `ScrapeData`-shaped object to truncate. The **embedded** backend
588/// returns the bare `ScrapeData` (fields at the top level); the **proxy** backend
589/// forwards the REST `ApiResponse<ScrapeData>` envelope (`{success, data:{…}}`).
590/// We unwrap the `data` envelope when present so both shapes are bounded identically.
591fn scrape_target_mut(value: &mut Value) -> Option<&mut Value> {
592    if value.get("data").is_some_and(Value::is_object) {
593        value.get_mut("data")
594    } else if value.is_object() {
595        Some(value)
596    } else {
597        None
598    }
599}
600
601/// Truncate the `links` list to `limit` with markers, wherever it lives: top-level
602/// (embedded `{success, links}`) or under the `data` envelope (proxy
603/// `ApiResponse<MapData>` = `{success, data:{links}}`).
604fn bound_map_links(value: &mut Value, limit: usize) {
605    let in_envelope = value.get("data").and_then(|d| d.get("links")).is_some();
606    let Some(container) = (if in_envelope {
607        value.get_mut("data")
608    } else {
609        Some(&mut *value)
610    }) else {
611        return;
612    };
613    let Some(total) = container
614        .get("links")
615        .and_then(Value::as_array)
616        .map(Vec::len)
617    else {
618        return;
619    };
620    if total <= limit {
621        return;
622    }
623    if let Some(obj) = container.as_object_mut() {
624        if let Some(Value::Array(links)) = obj.get_mut("links") {
625            links.truncate(limit);
626        }
627        obj.insert("totalDiscovered".to_string(), json!(total));
628        obj.insert("truncated".to_string(), Value::Bool(true));
629    }
630}
631
632/// Truncate any scrape content inlined into `crw_search` results (via
633/// `scrapeOptions`). `results` lives at `data.results` and is either a flat array
634/// of items or a grouped `{web,news,images}` object of arrays.
635fn bound_search_results(value: &mut Value, max: usize) {
636    let Some(results) = value.get_mut("data").and_then(|d| d.get_mut("results")) else {
637        return;
638    };
639    match results {
640        Value::Array(items) => {
641            for item in items.iter_mut() {
642                truncate_scrape_obj(item, max);
643            }
644        }
645        Value::Object(groups) => {
646            for arr in groups.values_mut() {
647                if let Some(items) = arr.as_array_mut() {
648                    for item in items.iter_mut() {
649                        truncate_scrape_obj(item, max);
650                    }
651                }
652            }
653        }
654        _ => {}
655    }
656}
657
658/// Bound a tool result's size at the MCP layer, driven by the call's own
659/// `maxLength`/`limit` arguments (see [`resolve_bound`] for the `0 = unbounded`
660/// opt-out). **Non-mutating** w.r.t. any stored state: it transforms an owned
661/// `Value` produced by the dispatch and returns a new one. Shared by the embedded,
662/// proxy, and CLI paths, and handles BOTH the bare (embedded) and `ApiResponse`-
663/// enveloped (proxy) result shapes so the two behave identically.
664pub fn apply_bounds(tool_name: &str, args: &Value, mut value: Value) -> Value {
665    match tool_name {
666        "crw_scrape" | "crw_parse_file" => {
667            if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH)
668                && let Some(target) = scrape_target_mut(&mut value)
669            {
670                truncate_scrape_obj(target, max);
671            }
672        }
673        "crw_check_crawl_status" => {
674            // CrawlState is returned bare (top-level `data` array) by both the
675            // embedded backend and the REST `GET /v1/crawl/{id}` endpoint.
676            if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH)
677                && let Some(pages) = value.get_mut("data").and_then(Value::as_array_mut)
678            {
679                for page in pages.iter_mut() {
680                    truncate_scrape_obj(page, max);
681                }
682            }
683        }
684        "crw_map" => {
685            if let Some(limit) = resolve_bound(args, "limit", DEFAULT_MAP_LIMIT) {
686                bound_map_links(&mut value, limit);
687            }
688        }
689        "crw_search" => {
690            if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH) {
691                bound_search_results(&mut value, max);
692            }
693        }
694        _ => {}
695    }
696    value
697}
698
699/// Remove MCP-only control args (`maxLength`, `crw_map`'s `limit`) before a proxy
700/// forwards the call to a REST endpoint that may reject unknown body fields. These
701/// are applied locally via [`apply_bounds`] on the response instead. Note
702/// `crw_search.limit` is a *real* backend param and is intentionally NOT stripped.
703pub fn strip_mcp_only_args(tool_name: &str, mut args: Value) -> Value {
704    if let Some(obj) = args.as_object_mut() {
705        match tool_name {
706            "crw_scrape" | "crw_parse_file" | "crw_check_crawl_status" => {
707                obj.remove("maxLength");
708            }
709            "crw_map" => {
710                obj.remove("limit");
711            }
712            _ => {}
713        }
714    }
715    args
716}
717
718#[cfg(test)]
719mod tests {
720    use super::*;
721
722    fn tool_by_name<'a>(tools: &'a Value, name: &str) -> &'a Value {
723        tools["tools"]
724            .as_array()
725            .expect("tools array")
726            .iter()
727            .find(|t| t["name"] == name)
728            .unwrap_or_else(|| panic!("tool {name} not found"))
729    }
730
    /// Upper bound, in *estimated* tokens, for the serialized `tools/list` payload.
    /// Every byte of that payload is injected into the agent's context on every
    /// turn, so this is the server's single most important footprint metric.
    ///
    /// Tokens are estimated as `ceil(bytes / 3)` — a deliberately *conservative*
    /// (over-counting) heuristic: symbol-heavy JSON tokenizes at ~3–4 chars/token,
    /// so an estimate under the ceiling means the real (tiktoken/cl100k) count is
    /// comfortably under it too; the real cl100k count is ~25–30% lower. A real
    /// tokenizer (`tiktoken-rs`) was considered but rejected to keep this leaf crate
    /// dependency-free; the estimate is sufficient for a regression gate.
    ///
    /// Baseline before the Phase 1 trim was 8233 bytes (~2744 est-tok). After the
    /// Phase 1 trim + Phase 3 annotations/titles the full 6-tool list is ~6189 bytes
    /// (~2063 est-tok ≈ ~1450 real cl100k tok). The ceiling is that floor + ~11%, so
    /// the gate catches real bloat without churning on minor edits.
    const TOOLS_LIST_TOKEN_CEILING: usize = 2300;
747
748    #[test]
749    fn tools_list_token_budget() {
750        let json = serde_json::to_string(&tool_definitions(false)).unwrap();
751        let est_tokens = json.len().div_ceil(3);
752        assert!(
753            est_tokens <= TOOLS_LIST_TOKEN_CEILING,
754            "tools/list footprint regressed: {} bytes ≈ {} est-tokens (ceiling {}). \
755             Trim descriptions/schemas before raising the ceiling.",
756            json.len(),
757            est_tokens,
758            TOOLS_LIST_TOKEN_CEILING
759        );
760    }
761
762    #[test]
763    fn crw_scrape_schema_advertises_render_js() {
764        let defs = tool_definitions(false);
765        let scrape = tool_by_name(&defs, "crw_scrape");
766        let props = &scrape["inputSchema"]["properties"];
767        assert_eq!(
768            props["renderJs"]["type"], "boolean",
769            "renderJs must be a plain boolean in the advertised schema"
770        );
771        assert!(
772            props["renderJs"].get("default").is_none(),
773            "renderJs must not advertise a default — server resolves it"
774        );
775    }
776
777    #[test]
778    fn crw_scrape_schema_advertises_wait_for() {
779        let defs = tool_definitions(false);
780        let scrape = tool_by_name(&defs, "crw_scrape");
781        let props = &scrape["inputSchema"]["properties"];
782        assert_eq!(props["waitFor"]["type"], "integer");
783    }
784
785    #[test]
786    fn crw_scrape_render_js_not_required() {
787        let defs = tool_definitions(false);
788        let scrape = tool_by_name(&defs, "crw_scrape");
789        let required = scrape["inputSchema"]["required"]
790            .as_array()
791            .expect("required array");
792        assert!(
793            !required.iter().any(|v| v == "renderJs"),
794            "renderJs must not be in required"
795        );
796        assert!(
797            !required.iter().any(|v| v == "waitFor"),
798            "waitFor must not be in required"
799        );
800    }
801
802    #[test]
803    fn crw_crawl_schema_advertises_render_js_and_wait_for() {
804        let defs = tool_definitions(false);
805        let crawl = tool_by_name(&defs, "crw_crawl");
806        let props = &crawl["inputSchema"]["properties"];
807        assert_eq!(props["renderJs"]["type"], "boolean");
808        assert_eq!(props["waitFor"]["type"], "integer");
809    }
810
811    #[test]
812    fn crw_scrape_schema_advertises_renderer() {
813        let defs = tool_definitions(false);
814        let scrape = tool_by_name(&defs, "crw_scrape");
815        let props = &scrape["inputSchema"]["properties"];
816        assert_eq!(props["renderer"]["type"], "string");
817        let enum_vals = props["renderer"]["enum"]
818            .as_array()
819            .expect("renderer.enum must be an array");
820        assert_eq!(
821            enum_vals,
822            &vec![
823                json!("auto"),
824                json!("lightpanda"),
825                json!("chrome"),
826                json!("playwright"),
827            ]
828        );
829    }
830
831    #[test]
832    fn crw_scrape_renderer_not_required() {
833        let defs = tool_definitions(false);
834        let scrape = tool_by_name(&defs, "crw_scrape");
835        let required = scrape["inputSchema"]["required"]
836            .as_array()
837            .expect("required array");
838        assert!(!required.iter().any(|v| v == "renderer"));
839    }
840
841    #[test]
842    fn crw_crawl_schema_advertises_renderer() {
843        let defs = tool_definitions(false);
844        let crawl = tool_by_name(&defs, "crw_crawl");
845        let props = &crawl["inputSchema"]["properties"];
846        assert_eq!(props["renderer"]["type"], "string");
847        let enum_vals = props["renderer"]["enum"]
848            .as_array()
849            .expect("renderer.enum must be an array");
850        assert_eq!(enum_vals.len(), 4);
851        assert!(enum_vals.iter().any(|v| v == "chrome"));
852        assert!(enum_vals.iter().any(|v| v == "lightpanda"));
853        assert!(enum_vals.iter().any(|v| v == "auto"));
854        assert!(enum_vals.iter().any(|v| v == "playwright"));
855    }
856
857    #[test]
858    fn schemas_do_not_set_additional_properties_false() {
859        // Deferred to a follow-up issue. Guard against accidentally enabling
860        // this before the schemas are expanded to full ScrapeRequest parity.
861        let defs = tool_definitions(false);
862        for name in ["crw_scrape", "crw_crawl", "crw_map"] {
863            let tool = tool_by_name(&defs, name);
864            let ap = &tool["inputSchema"].get("additionalProperties");
865            assert!(
866                ap.is_none() || ap.as_ref().and_then(|v| v.as_bool()) != Some(false),
867                "{name}: additionalProperties:false must remain off until schemas are complete"
868            );
869        }
870    }
871
872    // --- structuredContent emission (issue #89) ---
873
874    /// A single text-result item with every always-emitted field set, plus the
875    /// optional `score`/`category`. `snippet` mirrors `description`, matching the
876    /// real `SearchResult` serializer (snippet is an alias of description).
877    fn search_result_item(idx: u32) -> Value {
878        json!({
879            "url": format!("https://example.com/{idx}"),
880            "title": format!("Result {idx}"),
881            "description": "body text",
882            "snippet": "body text",
883            "position": idx,
884            "score": 4.0,
885            "category": "general"
886        })
887    }
888
889    /// A representative flat (`sources` unset) crw_search success value, shaped
890    /// like `ApiResponse::ok(SearchResponseData { results: Flat(..), .. })`.
891    fn representative_search_value() -> Value {
892        json!({
893            "success": true,
894            "data": { "results": [search_result_item(1), search_result_item(2)] }
895        })
896    }
897
898    /// A representative grouped (`sources` set) value: `results` is an object with
899    /// `web`/`news` (text items) and `images` (the differently-shaped ImageResult).
900    fn grouped_search_value() -> Value {
901        json!({
902            "success": true,
903            "data": { "results": {
904                "web": [search_result_item(1)],
905                "news": [search_result_item(2)],
906                "images": [{
907                    "url": "https://example.com/img",
908                    "title": "An image",
909                    "description": "alt text",
910                    "imageUrl": "https://example.com/img.png",
911                    "position": 1
912                }]
913            }}
914        })
915    }
916
917    fn result_of(resp: &JsonRpcResponse) -> &Value {
918        resp.result.as_ref().expect("success response has result")
919    }
920
921    /// T1 — crw_search Ok emits BOTH a text block and structuredContent, and the
922    /// two are byte-for-byte the same value (single-source invariant).
923    #[test]
924    fn t1_search_emits_dual_content_in_sync() {
925        let repr = representative_search_value();
926        let resp = tool_result_response(json!(1), "crw_search", Ok(repr.clone()));
927        let result = result_of(&resp);
928
929        let text = result["content"][0]["text"]
930            .as_str()
931            .expect("text content present");
932        assert_eq!(
933            result["content"][0]["type"], "text",
934            "first content block is text"
935        );
936
937        let structured = &result["structuredContent"];
938        assert!(!structured.is_null(), "structuredContent present");
939        assert_eq!(
940            structured, &repr,
941            "structuredContent is the unmodified value"
942        );
943
944        let from_text: Value = serde_json::from_str(text).expect("text is valid JSON");
945        assert_eq!(
946            &from_text, structured,
947            "from_str(content.text) == structuredContent (no drift)"
948        );
949    }
950
951    /// T2 — a tool WITHOUT an outputSchema (crw_scrape) gets text only, no
952    /// structuredContent (schema-gated emission).
953    #[test]
954    fn t2_scrape_has_no_structured_content() {
955        let resp = tool_result_response(json!(1), "crw_scrape", Ok(json!({"markdown": "hi"})));
956        let result = result_of(&resp);
957        assert!(result["content"][0]["text"].is_string());
958        assert!(
959            result.get("structuredContent").is_none(),
960            "crw_scrape declares no outputSchema → no structuredContent"
961        );
962    }
963
964    /// T2b — proxy version-skew safety valve: a schema-bearing tool whose Ok
965    /// value is NOT an object (upstream error string, or a legacy top-level
966    /// array) degrades to text-only. Locks the is_object() guard.
967    #[test]
968    fn t2b_non_object_search_value_degrades_to_text() {
969        for non_object in [json!("upstream error string"), json!([{ "url": "x" }])] {
970            let resp = tool_result_response(json!(1), "crw_search", Ok(non_object.clone()));
971            let result = result_of(&resp);
972            assert!(
973                result["content"][0]["text"].is_string(),
974                "text block carries the body"
975            );
976            assert!(
977                result.get("structuredContent").is_none(),
978                "non-object Ok value must NOT emit structuredContent: {non_object}"
979            );
980        }
981    }
982
983    /// T3 — the Err path is an isError text result with no structuredContent.
984    #[test]
985    fn t3_error_path_has_no_structured_content() {
986        let resp = tool_result_response(json!(1), "crw_search", Err("boom".into()));
987        let result = result_of(&resp);
988        assert_eq!(result["isError"], true);
989        assert_eq!(result["content"][0]["text"], "boom");
990        assert!(result.get("structuredContent").is_none());
991    }
992
993    /// T4 — emitted structuredContent validates against the declared outputSchema
994    /// for both the flat and the grouped value (using the same builders the
995    /// real serializer would feed).
996    #[test]
997    fn t4_emitted_structured_content_validates_against_schema() {
998        let schema = tool_output_schema("crw_search").expect("crw_search has outputSchema");
999        let validator = jsonschema::validator_for(&schema).expect("schema compiles");
1000
1001        for value in [representative_search_value(), grouped_search_value()] {
1002            let resp = tool_result_response(json!(1), "crw_search", Ok(value.clone()));
1003            let structured = result_of(&resp)["structuredContent"].clone();
1004            let errors: Vec<String> = validator
1005                .iter_errors(&structured)
1006                .map(|e| e.to_string())
1007                .collect();
1008            assert!(
1009                errors.is_empty(),
1010                "structuredContent failed schema validation for {value}:\n{}",
1011                errors.join("\n")
1012            );
1013        }
1014    }
1015
1016    /// T5 — the helper is the single source of truth: present for crw_search,
1017    /// absent for crw_scrape, with the expected required-field structure.
1018    #[test]
1019    fn t5_tool_output_schema_helper() {
1020        let schema = tool_output_schema("crw_search").expect("crw_search has outputSchema");
1021        assert_eq!(schema["type"], "object");
1022        let required = schema["required"].as_array().expect("required array");
1023        assert_eq!(required, &vec![json!("success"), json!("data")]);
1024        assert_eq!(schema["properties"]["data"]["type"], "object");
1025        let data_required = schema["properties"]["data"]["required"]
1026            .as_array()
1027            .expect("data.required array");
1028        assert!(data_required.iter().any(|v| v == "results"));
1029
1030        assert!(
1031            tool_output_schema("crw_scrape").is_none(),
1032            "crw_scrape declares no outputSchema"
1033        );
1034    }
1035
1036    /// T6 — the additionalProperties:false guard is scoped to inputSchema only;
1037    /// the new outputSchema must not set it (the conditional SearchResult fields
1038    /// would make it falsely reject real responses).
1039    #[test]
1040    fn t6_output_schema_does_not_set_additional_properties_false() {
1041        let defs = tool_definitions(false);
1042        let search = tool_by_name(&defs, "crw_search");
1043        let ap = search["outputSchema"].get("additionalProperties");
1044        assert!(
1045            ap.is_none() || ap.and_then(|v| v.as_bool()) != Some(false),
1046            "crw_search outputSchema must not set additionalProperties:false"
1047        );
1048    }
1049
1050    // --- Output bounding (apply_bounds / strip_mcp_only_args) ---
1051
1052    fn long_md(chars: usize) -> String {
1053        "x".repeat(chars)
1054    }
1055
1056    /// B1 — crw_scrape truncates markdown past the default cap and tags `truncated`.
1057    #[test]
1058    fn b1_scrape_truncates_to_default_max_length() {
1059        let value =
1060            json!({ "markdown": long_md(DEFAULT_MAX_LENGTH + 500), "url": "https://e.com" });
1061        let out = apply_bounds("crw_scrape", &json!({}), value);
1062        let md = out["markdown"].as_str().unwrap();
1063        assert!(
1064            md.chars().count() <= DEFAULT_MAX_LENGTH + 40,
1065            "truncated to ~cap + marker"
1066        );
1067        assert!(md.contains("[truncated"), "marker present");
1068        assert_eq!(out["truncated"], json!(true));
1069    }
1070
1071    /// B2 — short content is untouched and gets no `truncated` flag.
1072    #[test]
1073    fn b2_scrape_short_content_untouched() {
1074        let value = json!({ "markdown": "hello", "url": "https://e.com" });
1075        let out = apply_bounds("crw_scrape", &json!({}), value);
1076        assert_eq!(out["markdown"], json!("hello"));
1077        assert!(out.get("truncated").is_none());
1078    }
1079
1080    /// B3 — explicit `maxLength: 0` opts out of bounding (unbounded).
1081    #[test]
1082    fn b3_scrape_max_length_zero_is_unbounded() {
1083        let big = long_md(DEFAULT_MAX_LENGTH * 2);
1084        let value = json!({ "markdown": big.clone() });
1085        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 0 }), value);
1086        assert_eq!(
1087            out["markdown"].as_str().unwrap().chars().count(),
1088            big.chars().count()
1089        );
1090        assert!(out.get("truncated").is_none());
1091    }
1092
1093    /// B4 — a custom `maxLength` is honored.
1094    #[test]
1095    fn b4_scrape_custom_max_length() {
1096        let value = json!({ "markdown": long_md(100) });
1097        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 10 }), value);
1098        let md = out["markdown"].as_str().unwrap();
1099        assert!(md.starts_with(&"x".repeat(10)));
1100        assert!(md.contains("[truncated"));
1101    }
1102
1103    /// B5 — crw_map truncates the links list to the default limit with markers.
1104    #[test]
1105    fn b5_map_truncates_links_to_limit() {
1106        let links: Vec<Value> = (0..250)
1107            .map(|i| json!(format!("https://e.com/{i}")))
1108            .collect();
1109        let value = json!({ "success": true, "links": links });
1110        let out = apply_bounds("crw_map", &json!({}), value);
1111        assert_eq!(out["links"].as_array().unwrap().len(), DEFAULT_MAP_LIMIT);
1112        assert_eq!(out["totalDiscovered"], json!(250));
1113        assert_eq!(out["truncated"], json!(true));
1114    }
1115
1116    /// B6 — crw_map `limit: 0` returns all links, no markers.
1117    #[test]
1118    fn b6_map_limit_zero_is_unbounded() {
1119        let links: Vec<Value> = (0..250)
1120            .map(|i| json!(format!("https://e.com/{i}")))
1121            .collect();
1122        let value = json!({ "links": links });
1123        let out = apply_bounds("crw_map", &json!({ "limit": 0 }), value);
1124        assert_eq!(out["links"].as_array().unwrap().len(), 250);
1125        assert!(out.get("truncated").is_none());
1126    }
1127
1128    /// B7 — crw_check_crawl_status truncates each page in `data`.
1129    #[test]
1130    fn b7_crawl_status_truncates_each_page() {
1131        let value = json!({
1132            "status": "completed",
1133            "data": [
1134                { "markdown": long_md(DEFAULT_MAX_LENGTH + 100), "url": "https://e.com/1" },
1135                { "markdown": "short", "url": "https://e.com/2" }
1136            ]
1137        });
1138        let out = apply_bounds("crw_check_crawl_status", &json!({}), value);
1139        let pages = out["data"].as_array().unwrap();
1140        assert_eq!(pages[0]["truncated"], json!(true));
1141        assert!(
1142            pages[0]["markdown"]
1143                .as_str()
1144                .unwrap()
1145                .contains("[truncated")
1146        );
1147        assert!(pages[1].get("truncated").is_none());
1148        assert_eq!(pages[1]["markdown"], json!("short"));
1149    }
1150
1151    /// B8 — truncation cuts on a char boundary (no panic on multibyte input).
1152    #[test]
1153    fn b8_truncation_is_char_safe() {
1154        let value = json!({ "markdown": "é".repeat(100) });
1155        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 10 }), value);
1156        // Must not panic and must keep exactly 10 'é' chars before the marker.
1157        assert!(
1158            out["markdown"]
1159                .as_str()
1160                .unwrap()
1161                .starts_with(&"é".repeat(10))
1162        );
1163    }
1164
1165    /// B9 — strip removes MCP-only args per tool, but keeps crw_search's real `limit`.
1166    #[test]
1167    fn b9_strip_mcp_only_args() {
1168        let scrape = strip_mcp_only_args("crw_scrape", json!({ "url": "u", "maxLength": 100 }));
1169        assert!(scrape.get("maxLength").is_none());
1170        assert_eq!(scrape["url"], json!("u"));
1171
1172        let map = strip_mcp_only_args("crw_map", json!({ "url": "u", "limit": 50 }));
1173        assert!(map.get("limit").is_none());
1174
1175        // crw_search.limit is a real backend param — must NOT be stripped.
1176        let search = strip_mcp_only_args("crw_search", json!({ "query": "q", "limit": 5 }));
1177        assert_eq!(search["limit"], json!(5));
1178    }
1179
1180    /// B10 — unknown/other tools pass through apply_bounds unchanged.
1181    #[test]
1182    fn b10_unknown_tool_passthrough() {
1183        let value = json!({ "anything": [1, 2, 3] });
1184        let out = apply_bounds("crw_crawl", &json!({}), value.clone());
1185        assert_eq!(out, value);
1186    }
1187
1188    /// B11 — PROXY shape: crw_scrape `ApiResponse<ScrapeData>` envelope
1189    /// (`{success, data:{markdown}}`) is truncated under `data`, not skipped.
1190    #[test]
1191    fn b11_scrape_proxy_envelope_is_bounded() {
1192        let value = json!({
1193            "success": true,
1194            "data": { "markdown": long_md(DEFAULT_MAX_LENGTH + 500), "url": "https://e.com" }
1195        });
1196        let out = apply_bounds("crw_scrape", &json!({}), value);
1197        let md = out["data"]["markdown"].as_str().unwrap();
1198        assert!(
1199            md.contains("[truncated"),
1200            "proxy-enveloped scrape must be bounded"
1201        );
1202        assert_eq!(out["data"]["truncated"], json!(true));
1203    }
1204
1205    /// B12 — PROXY shape: crw_map `ApiResponse<MapData>` envelope
1206    /// (`{success, data:{links}}`) is truncated under `data`.
1207    #[test]
1208    fn b12_map_proxy_envelope_is_bounded() {
1209        let links: Vec<Value> = (0..250)
1210            .map(|i| json!(format!("https://e.com/{i}")))
1211            .collect();
1212        let value = json!({ "success": true, "data": { "links": links } });
1213        let out = apply_bounds("crw_map", &json!({}), value);
1214        assert_eq!(
1215            out["data"]["links"].as_array().unwrap().len(),
1216            DEFAULT_MAP_LIMIT
1217        );
1218        assert_eq!(out["data"]["totalDiscovered"], json!(250));
1219        assert_eq!(out["data"]["truncated"], json!(true));
1220    }
1221
1222    /// A1 — every tool advertises annotations + a title; crw_crawl is the only
1223    /// non-read-only / non-idempotent tool; crw_parse_file is the only closed-world.
1224    #[test]
1225    fn a1_tools_advertise_annotations_and_title() {
1226        let defs = tool_definitions(false);
1227        for t in defs["tools"].as_array().unwrap() {
1228            assert!(t["annotations"].is_object(), "{} annotations", t["name"]);
1229            assert!(t["title"].is_string(), "{} title", t["name"]);
1230            // destructiveHint is explicitly false everywhere (the JSON default is true).
1231            assert_eq!(
1232                t["annotations"]["destructiveHint"],
1233                json!(false),
1234                "{}",
1235                t["name"]
1236            );
1237        }
1238        let crawl = tool_by_name(&defs, "crw_crawl");
1239        assert_eq!(crawl["annotations"]["readOnlyHint"], json!(false));
1240        assert_eq!(crawl["annotations"]["idempotentHint"], json!(false));
1241        let scrape = tool_by_name(&defs, "crw_scrape");
1242        assert_eq!(scrape["annotations"]["readOnlyHint"], json!(true));
1243        assert_eq!(scrape["annotations"]["openWorldHint"], json!(true));
1244        let parse = tool_by_name(&defs, "crw_parse_file");
1245        assert_eq!(parse["annotations"]["openWorldHint"], json!(false));
1246    }
1247
1248    /// A2 — is_known_tool recognizes all 6 tool names, rejects others.
1249    #[test]
1250    fn a2_is_known_tool() {
1251        for name in [
1252            "crw_scrape",
1253            "crw_crawl",
1254            "crw_check_crawl_status",
1255            "crw_map",
1256            "crw_search",
1257            "crw_parse_file",
1258        ] {
1259            assert!(is_known_tool(name), "{name} should be known");
1260        }
1261        assert!(!is_known_tool("nonexistent"));
1262        assert!(!is_known_tool(""));
1263    }
1264
1265    /// A3 — tools/list suppresses crw_search when no backend; includes it otherwise.
1266    #[test]
1267    fn a3_tools_list_conditional_search() {
1268        fn list(search_available: bool) -> Vec<String> {
1269            let req = JsonRpcRequest {
1270                jsonrpc: "2.0".into(),
1271                id: Some(json!(1)),
1272                method: "tools/list".into(),
1273                params: json!({}),
1274            };
1275            let ProtocolResult::Response(resp) =
1276                handle_protocol_method("crw", "0", &req, false, search_available)
1277            else {
1278                panic!("expected response");
1279            };
1280            resp.result.unwrap()["tools"]
1281                .as_array()
1282                .unwrap()
1283                .iter()
1284                .map(|t| t["name"].as_str().unwrap().to_string())
1285                .collect()
1286        }
1287        let with = list(true);
1288        assert!(with.contains(&"crw_search".to_string()));
1289        assert_eq!(with.len(), 6);
1290        let without = list(false);
1291        assert!(!without.contains(&"crw_search".to_string()));
1292        assert_eq!(without.len(), 5);
1293    }
1294
1295    /// B13 — crw_search inlined scrape content (flat + grouped) is truncated.
1296    #[test]
1297    fn b13_search_inlined_content_is_bounded() {
1298        // Flat results with inlined markdown.
1299        let flat = json!({
1300            "success": true,
1301            "data": { "results": [
1302                { "url": "https://e.com/1", "markdown": long_md(DEFAULT_MAX_LENGTH + 100) },
1303                { "url": "https://e.com/2", "description": "no scrape content" }
1304            ]}
1305        });
1306        let out = apply_bounds("crw_search", &json!({}), flat);
1307        assert!(
1308            out["data"]["results"][0]["markdown"]
1309                .as_str()
1310                .unwrap()
1311                .contains("[truncated")
1312        );
1313        assert_eq!(out["data"]["results"][0]["truncated"], json!(true));
1314        assert!(out["data"]["results"][1].get("truncated").is_none());
1315
1316        // Grouped results.
1317        let grouped = json!({
1318            "success": true,
1319            "data": { "results": {
1320                "web": [{ "url": "https://e.com/w", "html": long_md(DEFAULT_MAX_LENGTH + 100) }],
1321                "news": [{ "url": "https://e.com/n", "description": "short" }]
1322            }}
1323        });
1324        let out = apply_bounds("crw_search", &json!({}), grouped);
1325        assert_eq!(out["data"]["results"]["web"][0]["truncated"], json!(true));
1326        assert!(out["data"]["results"]["news"][0].get("truncated").is_none());
1327    }
1328}
crw_mcp_proto/lib.rs

crw_mcp_proto/
lib.rs