use std::sync::OnceLock;

use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
7
/// Protocol revision string reported as `protocolVersion` in the `initialize` response.
pub const PROTOCOL_VERSION: &str = "2025-06-18";
18
19#[derive(Deserialize)]
22pub struct JsonRpcRequest {
23 pub jsonrpc: String,
24 pub id: Option<Value>,
25 pub method: String,
26 #[serde(default)]
27 pub params: Value,
28}
29
30#[derive(Serialize)]
31pub struct JsonRpcResponse {
32 pub jsonrpc: String,
33 pub id: Value,
34 #[serde(skip_serializing_if = "Option::is_none")]
35 pub result: Option<Value>,
36 #[serde(skip_serializing_if = "Option::is_none")]
37 pub error: Option<JsonRpcError>,
38}
39
40#[derive(Serialize)]
41pub struct JsonRpcError {
42 pub code: i64,
43 pub message: String,
44}
45
46impl JsonRpcResponse {
47 pub fn success(id: Value, result: Value) -> Self {
48 Self {
49 jsonrpc: "2.0".into(),
50 id,
51 result: Some(result),
52 error: None,
53 }
54 }
55
56 pub fn error(id: Value, code: i64, message: String) -> Self {
57 Self {
58 jsonrpc: "2.0".into(),
59 id,
60 result: None,
61 error: Some(JsonRpcError { code, message }),
62 }
63 }
64}
65
66pub fn tool_definitions(proxy_mode: bool) -> Value {
69 let mut tools = vec![
70 json!({
71 "name": "crw_scrape",
72 "title": "Scrape URL",
73 "description": "Scrape one URL to markdown, HTML, or links.",
74 "annotations": {
75 "readOnlyHint": true,
76 "destructiveHint": false,
77 "idempotentHint": true,
78 "openWorldHint": true
79 },
80 "inputSchema": {
81 "type": "object",
82 "properties": {
83 "url": {
84 "type": "string",
85 "description": "URL to scrape"
86 },
87 "formats": {
88 "type": "array",
89 "items": { "type": "string", "enum": ["markdown", "html", "links"] },
90 "description": "Output formats (default [\"markdown\"])"
91 },
92 "onlyMainContent": {
93 "type": "boolean",
94 "description": "Strip nav/footer; main content only (default true)"
95 },
96 "includeTags": {
97 "type": "array",
98 "items": { "type": "string" },
99 "description": "CSS selectors to include"
100 },
101 "excludeTags": {
102 "type": "array",
103 "items": { "type": "string" },
104 "description": "CSS selectors to exclude"
105 },
106 "renderJs": {
107 "type": "boolean",
108 "description": "Force JS render (true), HTTP-only (false), omit = auto"
109 },
110 "waitFor": {
111 "type": "integer",
112 "description": "Ms to wait after JS render for late content"
113 },
114 "maxLength": {
115 "type": "integer",
116 "minimum": 0,
117 "description": "Max chars per content field; 0 = unbounded (default ~15000)"
118 },
119 "renderer": {
120 "type": "string",
121 "enum": ["auto", "lightpanda", "chrome", "playwright"],
122 "description": "Pin renderer; non-auto hard-pins and implies renderJs:true (default auto)"
123 }
124 },
125 "required": ["url"]
126 }
127 }),
128 json!({
129 "name": "crw_crawl",
130 "title": "Crawl site",
131 "description": "Start an async site crawl; returns a job id to poll with crw_check_crawl_status.",
132 "annotations": {
135 "readOnlyHint": false,
136 "destructiveHint": false,
137 "idempotentHint": false,
138 "openWorldHint": true
139 },
140 "inputSchema": {
141 "type": "object",
142 "properties": {
143 "url": {
144 "type": "string",
145 "description": "Starting URL"
146 },
147 "maxDepth": {
148 "type": "integer",
149 "description": "Max crawl depth (default 2)"
150 },
151 "maxPages": {
152 "type": "integer",
153 "description": "Max pages to crawl (default 10)"
154 },
155 "jsonSchema": {
156 "type": "object",
157 "description": "JSON schema for LLM extraction per page"
158 },
159 "renderJs": {
160 "type": "boolean",
161 "description": "Force JS render (true), HTTP-only (false), omit = auto"
162 },
163 "waitFor": {
164 "type": "integer",
165 "description": "Ms to wait after JS render per page"
166 },
167 "renderer": {
168 "type": "string",
169 "enum": ["auto", "lightpanda", "chrome", "playwright"],
170 "description": "Pin renderer; non-auto hard-pins and implies renderJs:true (default auto)"
171 }
172 },
173 "required": ["url"]
174 }
175 }),
176 json!({
177 "name": "crw_check_crawl_status",
178 "title": "Check crawl status",
179 "description": "Poll an async crawl job and retrieve its pages.",
180 "annotations": {
181 "readOnlyHint": true,
182 "destructiveHint": false,
183 "idempotentHint": true,
184 "openWorldHint": true
185 },
186 "inputSchema": {
187 "type": "object",
188 "properties": {
189 "id": {
190 "type": "string",
191 "description": "Crawl job id from crw_crawl"
192 },
193 "maxLength": {
194 "type": "integer",
195 "minimum": 0,
196 "description": "Max chars per page content field; 0 = unbounded (default ~15000)"
197 }
198 },
199 "required": ["id"]
200 }
201 }),
202 json!({
203 "name": "crw_map",
204 "title": "Map site URLs",
205 "description": "Discover URLs on a site via sitemap and/or a short crawl. Returns a URL list only, no page content.",
206 "annotations": {
207 "readOnlyHint": true,
208 "destructiveHint": false,
209 "idempotentHint": true,
210 "openWorldHint": true
211 },
212 "inputSchema": {
213 "type": "object",
214 "properties": {
215 "url": {
216 "type": "string",
217 "description": "URL to map"
218 },
219 "maxDepth": {
220 "type": "integer",
221 "description": "Max discovery depth (default 2)"
222 },
223 "useSitemap": {
224 "type": "boolean",
225 "description": "Use sitemap.xml (default true)"
226 },
227 "crawlFallback": {
228 "type": "boolean",
229 "description": "Supplement sitemap with a short BFS crawl (default true; false = sitemap-only)"
230 },
231 "limit": {
232 "type": "integer",
233 "minimum": 0,
234 "description": "Max URLs returned; 0 = unbounded (default 100)"
235 }
236 },
237 "required": ["url"]
238 }
239 }),
240 ];
241
242 let _ = proxy_mode;
248 tools.push(json!({
249 "name": "crw_search",
250 "title": "Web search",
251 "description": "Search the web (needs a configured search backend; embedded uses a local SearXNG sidecar). Returns results with url/title/description/snippet.",
252 "annotations": {
253 "readOnlyHint": true,
254 "destructiveHint": false,
255 "idempotentHint": true,
256 "openWorldHint": true
257 },
258 "inputSchema": {
259 "type": "object",
260 "properties": {
261 "query": {
262 "type": "string",
263 "description": "Search query"
264 },
265 "limit": {
266 "type": "integer",
267 "description": "Max results (default 5, max 20)"
268 },
269 "lang": {
270 "type": "string",
271 "description": "Language code, e.g. \"en\", \"tr\""
272 },
273 "country": {
274 "type": "string",
275 "description": "Country code hint, e.g. \"us\", \"tr\""
276 },
277 "tbs": {
278 "type": "string",
279 "enum": ["qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y"],
280 "description": "Time filter: past hour/day/week/month/year"
281 },
282 "sources": {
283 "type": "array",
284 "items": { "type": "string", "enum": ["web", "news", "images"] },
285 "description": "If set, group results by source instead of a flat list"
286 },
287 "categories": {
288 "type": "array",
289 "items": { "type": "string" },
290 "description": "Category bias; e.g. \"pdf\", \"github\", \"research\", or a native SearXNG category"
291 },
292 "scrapeOptions": {
293 "type": "object",
294 "description": "If set, scrape each web result and inline the requested formats",
295 "properties": {
296 "formats": {
297 "type": "array",
298 "items": { "type": "string", "enum": ["markdown", "html", "rawHtml", "links"] }
299 },
300 "onlyMainContent": {
301 "type": "boolean",
302 "description": "Strip nav/footer/ads (default true)"
303 }
304 }
305 }
306 },
307 "required": ["query"]
308 },
309 "outputSchema": {
318 "type": "object",
319 "properties": {
320 "success": { "type": "boolean" },
321 "data": {
322 "type": "object",
323 "properties": {
324 "results": {
325 "oneOf": [
326 { "type": "array", "items": { "type": "object" } },
327 { "type": "object" }
328 ]
329 }
330 },
331 "required": ["results"]
332 }
333 },
334 "required": ["success", "data"]
335 }
336 }));
337
338 tools.push(json!({
339 "name": "crw_parse_file",
340 "title": "Parse PDF",
341 "description": "Parse a local PDF (base64 in contentBase64) to markdown. No OCR: scanned PDFs return empty markdown with a warning.",
342 "annotations": {
344 "readOnlyHint": true,
345 "destructiveHint": false,
346 "idempotentHint": true,
347 "openWorldHint": false
348 },
349 "inputSchema": {
350 "type": "object",
351 "properties": {
352 "contentBase64": {
353 "type": "string",
354 "description": "Base64-encoded PDF bytes"
355 },
356 "filename": {
357 "type": "string",
358 "description": "Original filename (optional)"
359 },
360 "formats": {
361 "type": "array",
362 "items": { "type": "string", "enum": ["markdown", "plainText", "links", "json", "summary"] },
363 "description": "Output formats (default [\"markdown\"]); json/summary need a server LLM"
364 },
365 "jsonSchema": {
366 "type": "object",
367 "description": "JSON schema for LLM extraction (when formats has json)"
368 },
369 "parsers": {
370 "type": "array",
371 "items": { "type": "string", "enum": ["pdf"] },
372 "description": "Parsers to apply (default [\"pdf\"])"
373 },
374 "maxLength": {
375 "type": "integer",
376 "minimum": 0,
377 "description": "Max chars per content field; 0 = unbounded (default ~15000)"
378 }
379 },
380 "required": ["contentBase64"]
381 }
382 }));
383
384 json!({ "tools": tools })
385}
386
387pub fn tool_output_schema(tool_name: &str) -> Option<Value> {
394 tool_definitions(false)["tools"]
395 .as_array()?
396 .iter()
397 .find(|t| t["name"] == tool_name)
398 .and_then(|t| t.get("outputSchema").cloned())
399}
400
401pub fn is_known_tool(name: &str) -> bool {
407 tool_definitions(false)["tools"]
408 .as_array()
409 .is_some_and(|tools| tools.iter().any(|t| t["name"] == name))
410}
411
/// Outcome of `handle_protocol_method`.
pub enum ProtocolResult {
    /// A response was produced for the request (success or error).
    Response(JsonRpcResponse),
    /// The message was a recognised notification; no response was produced.
    Notification,
    /// The method is not one of the protocol-level methods handled here.
    /// NOTE(review): presumably the caller dispatches these itself (e.g. tool calls) — confirm.
    NotHandled,
}
421
422pub fn handle_protocol_method(
429 server_name: &str,
430 server_version: &str,
431 req: &JsonRpcRequest,
432 proxy_mode: bool,
433 search_available: bool,
434) -> ProtocolResult {
435 if req.jsonrpc != "2.0" {
436 let id = req.id.clone().unwrap_or(Value::Null);
437 return ProtocolResult::Response(JsonRpcResponse::error(
438 id,
439 -32600,
440 "invalid jsonrpc version".into(),
441 ));
442 }
443
444 match req.method.as_str() {
445 "notifications/initialized" | "notifications/cancelled" => ProtocolResult::Notification,
446
447 "initialize" => {
448 let id = req.id.clone().unwrap_or(Value::Null);
449 ProtocolResult::Response(JsonRpcResponse::success(
450 id,
451 json!({
452 "protocolVersion": PROTOCOL_VERSION,
453 "capabilities": { "tools": { "listChanged": false } },
456 "serverInfo": {
457 "name": server_name,
458 "version": server_version
459 }
460 }),
461 ))
462 }
463
464 "tools/list" => {
465 let id = req.id.clone().unwrap_or(Value::Null);
466 let mut defs = tool_definitions(proxy_mode);
467 if !search_available
468 && let Some(tools) = defs.get_mut("tools").and_then(Value::as_array_mut)
469 {
470 tools.retain(|t| t["name"] != "crw_search");
471 }
472 ProtocolResult::Response(JsonRpcResponse::success(id, defs))
473 }
474
475 "ping" => {
476 let id = req.id.clone().unwrap_or(Value::Null);
477 ProtocolResult::Response(JsonRpcResponse::success(id, json!({})))
478 }
479
480 _ => ProtocolResult::NotHandled,
481 }
482}
483
484pub fn tool_result_response(
494 id: Value,
495 tool_name: &str,
496 result: Result<Value, String>,
497) -> JsonRpcResponse {
498 match result {
499 Ok(value) => {
500 let text = serde_json::to_string(&value).unwrap_or_default();
503 let mut payload = json!({
504 "content": [{"type": "text", "text": text}]
505 });
506 if value.is_object() && tool_output_schema(tool_name).is_some() {
515 payload["structuredContent"] = value;
516 }
517 JsonRpcResponse::success(id, payload)
518 }
519 Err(e) => JsonRpcResponse::success(
523 id,
524 json!({
525 "content": [{"type": "text", "text": e}],
526 "isError": true
527 }),
528 ),
529 }
530}
531
/// Default `maxLength`: the character cap `apply_bounds` uses per text field when the
/// caller does not supply one.
pub const DEFAULT_MAX_LENGTH: usize = 15_000;
/// Default `limit` for `crw_map`: the number of links `apply_bounds` keeps when the
/// caller does not supply one.
pub const DEFAULT_MAP_LIMIT: usize = 100;

/// Object keys whose string values `truncate_scrape_obj` shortens.
const SCRAPE_TEXT_FIELDS: &[&str] = &["markdown", "html", "rawHtml", "plainText", "summary"];
542
543fn resolve_bound(args: &Value, key: &str, default: usize) -> Option<usize> {
548 match args.get(key).and_then(Value::as_u64) {
549 None => Some(default),
550 Some(0) => None,
551 Some(n) => Some(n as usize),
552 }
553}
554
/// Cuts `s` down to its first `max_chars` characters and appends a truncation marker.
///
/// Returns `None` when `s` has at most `max_chars` characters, i.e. nothing needs
/// cutting. The cut is made on a character boundary, never inside a multi-byte
/// character.
fn truncate_to_chars(s: &str, max_chars: usize) -> Option<String> {
    const MARKER: &str = "\n…[truncated by crw-mcp maxLength]";

    // Byte offset of the first character that does not fit; absent if everything fits.
    let (cut_at, _) = s.char_indices().nth(max_chars)?;
    let kept = &s[..cut_at];

    let mut shortened = String::with_capacity(kept.len() + MARKER.len());
    shortened.push_str(kept);
    shortened.push_str(MARKER);
    Some(shortened)
}
564
565fn truncate_scrape_obj(value: &mut Value, max: usize) {
568 let Some(obj) = value.as_object_mut() else {
569 return;
570 };
571 let mut any = false;
572 for field in SCRAPE_TEXT_FIELDS {
573 let cut = match obj.get(*field) {
574 Some(Value::String(s)) => truncate_to_chars(s, max),
575 _ => None,
576 };
577 if let Some(t) = cut {
578 obj.insert((*field).to_string(), Value::String(t));
579 any = true;
580 }
581 }
582 if any {
583 obj.insert("truncated".to_string(), Value::Bool(true));
584 }
585}
586
587fn scrape_target_mut(value: &mut Value) -> Option<&mut Value> {
592 if value.get("data").is_some_and(Value::is_object) {
593 value.get_mut("data")
594 } else if value.is_object() {
595 Some(value)
596 } else {
597 None
598 }
599}
600
601fn bound_map_links(value: &mut Value, limit: usize) {
605 let in_envelope = value.get("data").and_then(|d| d.get("links")).is_some();
606 let Some(container) = (if in_envelope {
607 value.get_mut("data")
608 } else {
609 Some(&mut *value)
610 }) else {
611 return;
612 };
613 let Some(total) = container
614 .get("links")
615 .and_then(Value::as_array)
616 .map(Vec::len)
617 else {
618 return;
619 };
620 if total <= limit {
621 return;
622 }
623 if let Some(obj) = container.as_object_mut() {
624 if let Some(Value::Array(links)) = obj.get_mut("links") {
625 links.truncate(limit);
626 }
627 obj.insert("totalDiscovered".to_string(), json!(total));
628 obj.insert("truncated".to_string(), Value::Bool(true));
629 }
630}
631
632fn bound_search_results(value: &mut Value, max: usize) {
636 let Some(results) = value.get_mut("data").and_then(|d| d.get_mut("results")) else {
637 return;
638 };
639 match results {
640 Value::Array(items) => {
641 for item in items.iter_mut() {
642 truncate_scrape_obj(item, max);
643 }
644 }
645 Value::Object(groups) => {
646 for arr in groups.values_mut() {
647 if let Some(items) = arr.as_array_mut() {
648 for item in items.iter_mut() {
649 truncate_scrape_obj(item, max);
650 }
651 }
652 }
653 }
654 _ => {}
655 }
656}
657
658pub fn apply_bounds(tool_name: &str, args: &Value, mut value: Value) -> Value {
665 match tool_name {
666 "crw_scrape" | "crw_parse_file" => {
667 if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH)
668 && let Some(target) = scrape_target_mut(&mut value)
669 {
670 truncate_scrape_obj(target, max);
671 }
672 }
673 "crw_check_crawl_status" => {
674 if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH)
677 && let Some(pages) = value.get_mut("data").and_then(Value::as_array_mut)
678 {
679 for page in pages.iter_mut() {
680 truncate_scrape_obj(page, max);
681 }
682 }
683 }
684 "crw_map" => {
685 if let Some(limit) = resolve_bound(args, "limit", DEFAULT_MAP_LIMIT) {
686 bound_map_links(&mut value, limit);
687 }
688 }
689 "crw_search" => {
690 if let Some(max) = resolve_bound(args, "maxLength", DEFAULT_MAX_LENGTH) {
691 bound_search_results(&mut value, max);
692 }
693 }
694 _ => {}
695 }
696 value
697}
698
699pub fn strip_mcp_only_args(tool_name: &str, mut args: Value) -> Value {
704 if let Some(obj) = args.as_object_mut() {
705 match tool_name {
706 "crw_scrape" | "crw_parse_file" | "crw_check_crawl_status" => {
707 obj.remove("maxLength");
708 }
709 "crw_map" => {
710 obj.remove("limit");
711 }
712 _ => {}
713 }
714 }
715 args
716}
717
// Unit tests for the tool catalogue, response shaping and result bounding.
#[cfg(test)]
mod tests {
    use super::*;

    // Looks up a tool by name in a `{ "tools": [...] }` value; panics if it is missing.
    fn tool_by_name<'a>(tools: &'a Value, name: &str) -> &'a Value {
        tools["tools"]
            .as_array()
            .expect("tools array")
            .iter()
            .find(|t| t["name"] == name)
            .unwrap_or_else(|| panic!("tool {name} not found"))
    }

    // Upper bound on the estimated token count of the serialized catalogue.
    const TOOLS_LIST_TOKEN_CEILING: usize = 2300;

    // Guards the catalogue size: tokens are estimated as serialized bytes / 3, rounded up.
    #[test]
    fn tools_list_token_budget() {
        let json = serde_json::to_string(&tool_definitions(false)).unwrap();
        let est_tokens = json.len().div_ceil(3);
        assert!(
            est_tokens <= TOOLS_LIST_TOKEN_CEILING,
            "tools/list footprint regressed: {} bytes ≈ {} est-tokens (ceiling {}). \
             Trim descriptions/schemas before raising the ceiling.",
            json.len(),
            est_tokens,
            TOOLS_LIST_TOKEN_CEILING
        );
    }

    // `renderJs` is a boolean with no advertised default.
    #[test]
    fn crw_scrape_schema_advertises_render_js() {
        let defs = tool_definitions(false);
        let scrape = tool_by_name(&defs, "crw_scrape");
        let props = &scrape["inputSchema"]["properties"];
        assert_eq!(
            props["renderJs"]["type"], "boolean",
            "renderJs must be a plain boolean in the advertised schema"
        );
        assert!(
            props["renderJs"].get("default").is_none(),
            "renderJs must not advertise a default — server resolves it"
        );
    }

    // `waitFor` is advertised as an integer on crw_scrape.
    #[test]
    fn crw_scrape_schema_advertises_wait_for() {
        let defs = tool_definitions(false);
        let scrape = tool_by_name(&defs, "crw_scrape");
        let props = &scrape["inputSchema"]["properties"];
        assert_eq!(props["waitFor"]["type"], "integer");
    }

    // Neither `renderJs` nor `waitFor` is a required crw_scrape argument.
    #[test]
    fn crw_scrape_render_js_not_required() {
        let defs = tool_definitions(false);
        let scrape = tool_by_name(&defs, "crw_scrape");
        let required = scrape["inputSchema"]["required"]
            .as_array()
            .expect("required array");
        assert!(
            !required.iter().any(|v| v == "renderJs"),
            "renderJs must not be in required"
        );
        assert!(
            !required.iter().any(|v| v == "waitFor"),
            "waitFor must not be in required"
        );
    }

    // crw_crawl advertises the same `renderJs` / `waitFor` types as crw_scrape.
    #[test]
    fn crw_crawl_schema_advertises_render_js_and_wait_for() {
        let defs = tool_definitions(false);
        let crawl = tool_by_name(&defs, "crw_crawl");
        let props = &crawl["inputSchema"]["properties"];
        assert_eq!(props["renderJs"]["type"], "boolean");
        assert_eq!(props["waitFor"]["type"], "integer");
    }

    // `renderer` on crw_scrape is a string enum with exactly these values, in this order.
    #[test]
    fn crw_scrape_schema_advertises_renderer() {
        let defs = tool_definitions(false);
        let scrape = tool_by_name(&defs, "crw_scrape");
        let props = &scrape["inputSchema"]["properties"];
        assert_eq!(props["renderer"]["type"], "string");
        let enum_vals = props["renderer"]["enum"]
            .as_array()
            .expect("renderer.enum must be an array");
        assert_eq!(
            enum_vals,
            &vec![
                json!("auto"),
                json!("lightpanda"),
                json!("chrome"),
                json!("playwright"),
            ]
        );
    }

    // `renderer` is optional on crw_scrape.
    #[test]
    fn crw_scrape_renderer_not_required() {
        let defs = tool_definitions(false);
        let scrape = tool_by_name(&defs, "crw_scrape");
        let required = scrape["inputSchema"]["required"]
            .as_array()
            .expect("required array");
        assert!(!required.iter().any(|v| v == "renderer"));
    }

    // `renderer` on crw_crawl offers the same four values (order not checked).
    #[test]
    fn crw_crawl_schema_advertises_renderer() {
        let defs = tool_definitions(false);
        let crawl = tool_by_name(&defs, "crw_crawl");
        let props = &crawl["inputSchema"]["properties"];
        assert_eq!(props["renderer"]["type"], "string");
        let enum_vals = props["renderer"]["enum"]
            .as_array()
            .expect("renderer.enum must be an array");
        assert_eq!(enum_vals.len(), 4);
        assert!(enum_vals.iter().any(|v| v == "chrome"));
        assert!(enum_vals.iter().any(|v| v == "lightpanda"));
        assert!(enum_vals.iter().any(|v| v == "auto"));
        assert!(enum_vals.iter().any(|v| v == "playwright"));
    }

    // Input schemas must not reject unknown properties.
    #[test]
    fn schemas_do_not_set_additional_properties_false() {
        let defs = tool_definitions(false);
        for name in ["crw_scrape", "crw_crawl", "crw_map"] {
            let tool = tool_by_name(&defs, name);
            let ap = &tool["inputSchema"].get("additionalProperties");
            assert!(
                ap.is_none() || ap.as_ref().and_then(|v| v.as_bool()) != Some(false),
                "{name}: additionalProperties:false must remain off until schemas are complete"
            );
        }
    }

    // Builds one search result item whose url/title/position derive from `idx`.
    fn search_result_item(idx: u32) -> Value {
        json!({
            "url": format!("https://example.com/{idx}"),
            "title": format!("Result {idx}"),
            "description": "body text",
            "snippet": "body text",
            "position": idx,
            "score": 4.0,
            "category": "general"
        })
    }

    // Search payload with `results` as a flat array.
    fn representative_search_value() -> Value {
        json!({
            "success": true,
            "data": { "results": [search_result_item(1), search_result_item(2)] }
        })
    }

    // Search payload with `results` grouped by source.
    fn grouped_search_value() -> Value {
        json!({
            "success": true,
            "data": { "results": {
                "web": [search_result_item(1)],
                "news": [search_result_item(2)],
                "images": [{
                    "url": "https://example.com/img",
                    "title": "An image",
                    "description": "alt text",
                    "imageUrl": "https://example.com/img.png",
                    "position": 1
                }]
            }}
        })
    }

    // Unwraps the `result` of a success response.
    fn result_of(resp: &JsonRpcResponse) -> &Value {
        resp.result.as_ref().expect("success response has result")
    }

    // crw_search emits both a text block and structuredContent, and the two agree.
    #[test]
    fn t1_search_emits_dual_content_in_sync() {
        let repr = representative_search_value();
        let resp = tool_result_response(json!(1), "crw_search", Ok(repr.clone()));
        let result = result_of(&resp);

        let text = result["content"][0]["text"]
            .as_str()
            .expect("text content present");
        assert_eq!(
            result["content"][0]["type"], "text",
            "first content block is text"
        );

        let structured = &result["structuredContent"];
        assert!(!structured.is_null(), "structuredContent present");
        assert_eq!(
            structured, &repr,
            "structuredContent is the unmodified value"
        );

        let from_text: Value = serde_json::from_str(text).expect("text is valid JSON");
        assert_eq!(
            &from_text, structured,
            "from_str(content.text) == structuredContent (no drift)"
        );
    }

    // A tool without an outputSchema gets text only.
    #[test]
    fn t2_scrape_has_no_structured_content() {
        let resp = tool_result_response(json!(1), "crw_scrape", Ok(json!({"markdown": "hi"})));
        let result = result_of(&resp);
        assert!(result["content"][0]["text"].is_string());
        assert!(
            result.get("structuredContent").is_none(),
            "crw_scrape declares no outputSchema → no structuredContent"
        );
    }

    // Non-object Ok values (string, array) never produce structuredContent.
    #[test]
    fn t2b_non_object_search_value_degrades_to_text() {
        for non_object in [json!("upstream error string"), json!([{ "url": "x" }])] {
            let resp = tool_result_response(json!(1), "crw_search", Ok(non_object.clone()));
            let result = result_of(&resp);
            assert!(
                result["content"][0]["text"].is_string(),
                "text block carries the body"
            );
            assert!(
                result.get("structuredContent").is_none(),
                "non-object Ok value must NOT emit structuredContent: {non_object}"
            );
        }
    }

    // Tool errors set isError and carry the message as text, without structuredContent.
    #[test]
    fn t3_error_path_has_no_structured_content() {
        let resp = tool_result_response(json!(1), "crw_search", Err("boom".into()));
        let result = result_of(&resp);
        assert_eq!(result["isError"], true);
        assert_eq!(result["content"][0]["text"], "boom");
        assert!(result.get("structuredContent").is_none());
    }

    // Both result shapes (flat and grouped) validate against the declared outputSchema.
    #[test]
    fn t4_emitted_structured_content_validates_against_schema() {
        let schema = tool_output_schema("crw_search").expect("crw_search has outputSchema");
        let validator = jsonschema::validator_for(&schema).expect("schema compiles");

        for value in [representative_search_value(), grouped_search_value()] {
            let resp = tool_result_response(json!(1), "crw_search", Ok(value.clone()));
            let structured = result_of(&resp)["structuredContent"].clone();
            let errors: Vec<String> = validator
                .iter_errors(&structured)
                .map(|e| e.to_string())
                .collect();
            assert!(
                errors.is_empty(),
                "structuredContent failed schema validation for {value}:\n{}",
                errors.join("\n")
            );
        }
    }

    // `tool_output_schema` returns the crw_search schema and nothing for crw_scrape.
    #[test]
    fn t5_tool_output_schema_helper() {
        let schema = tool_output_schema("crw_search").expect("crw_search has outputSchema");
        assert_eq!(schema["type"], "object");
        let required = schema["required"].as_array().expect("required array");
        assert_eq!(required, &vec![json!("success"), json!("data")]);
        assert_eq!(schema["properties"]["data"]["type"], "object");
        let data_required = schema["properties"]["data"]["required"]
            .as_array()
            .expect("data.required array");
        assert!(data_required.iter().any(|v| v == "results"));

        assert!(
            tool_output_schema("crw_scrape").is_none(),
            "crw_scrape declares no outputSchema"
        );
    }

    // The output schema must tolerate extra properties.
    #[test]
    fn t6_output_schema_does_not_set_additional_properties_false() {
        let defs = tool_definitions(false);
        let search = tool_by_name(&defs, "crw_search");
        let ap = search["outputSchema"].get("additionalProperties");
        assert!(
            ap.is_none() || ap.and_then(|v| v.as_bool()) != Some(false),
            "crw_search outputSchema must not set additionalProperties:false"
        );
    }

    // Produces a string of `chars` ASCII characters.
    fn long_md(chars: usize) -> String {
        "x".repeat(chars)
    }

    // Without maxLength, scrape text is cut to the default cap plus the marker.
    #[test]
    fn b1_scrape_truncates_to_default_max_length() {
        let value =
            json!({ "markdown": long_md(DEFAULT_MAX_LENGTH + 500), "url": "https://e.com" });
        let out = apply_bounds("crw_scrape", &json!({}), value);
        let md = out["markdown"].as_str().unwrap();
        assert!(
            md.chars().count() <= DEFAULT_MAX_LENGTH + 40,
            "truncated to ~cap + marker"
        );
        assert!(md.contains("[truncated"), "marker present");
        assert_eq!(out["truncated"], json!(true));
    }

    // Content under the cap is returned unchanged and unflagged.
    #[test]
    fn b2_scrape_short_content_untouched() {
        let value = json!({ "markdown": "hello", "url": "https://e.com" });
        let out = apply_bounds("crw_scrape", &json!({}), value);
        assert_eq!(out["markdown"], json!("hello"));
        assert!(out.get("truncated").is_none());
    }

    // maxLength = 0 disables truncation.
    #[test]
    fn b3_scrape_max_length_zero_is_unbounded() {
        let big = long_md(DEFAULT_MAX_LENGTH * 2);
        let value = json!({ "markdown": big.clone() });
        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 0 }), value);
        assert_eq!(
            out["markdown"].as_str().unwrap().chars().count(),
            big.chars().count()
        );
        assert!(out.get("truncated").is_none());
    }

    // A caller-supplied maxLength is honoured.
    #[test]
    fn b4_scrape_custom_max_length() {
        let value = json!({ "markdown": long_md(100) });
        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 10 }), value);
        let md = out["markdown"].as_str().unwrap();
        assert!(md.starts_with(&"x".repeat(10)));
        assert!(md.contains("[truncated"));
    }

    // Map links are capped at the default limit and the original count is reported.
    #[test]
    fn b5_map_truncates_links_to_limit() {
        let links: Vec<Value> = (0..250)
            .map(|i| json!(format!("https://e.com/{i}")))
            .collect();
        let value = json!({ "success": true, "links": links });
        let out = apply_bounds("crw_map", &json!({}), value);
        assert_eq!(out["links"].as_array().unwrap().len(), DEFAULT_MAP_LIMIT);
        assert_eq!(out["totalDiscovered"], json!(250));
        assert_eq!(out["truncated"], json!(true));
    }

    // limit = 0 disables the link cap.
    #[test]
    fn b6_map_limit_zero_is_unbounded() {
        let links: Vec<Value> = (0..250)
            .map(|i| json!(format!("https://e.com/{i}")))
            .collect();
        let value = json!({ "links": links });
        let out = apply_bounds("crw_map", &json!({ "limit": 0 }), value);
        assert_eq!(out["links"].as_array().unwrap().len(), 250);
        assert!(out.get("truncated").is_none());
    }

    // Each crawl page is bounded independently; short pages stay unflagged.
    #[test]
    fn b7_crawl_status_truncates_each_page() {
        let value = json!({
            "status": "completed",
            "data": [
                { "markdown": long_md(DEFAULT_MAX_LENGTH + 100), "url": "https://e.com/1" },
                { "markdown": "short", "url": "https://e.com/2" }
            ]
        });
        let out = apply_bounds("crw_check_crawl_status", &json!({}), value);
        let pages = out["data"].as_array().unwrap();
        assert_eq!(pages[0]["truncated"], json!(true));
        assert!(
            pages[0]["markdown"]
                .as_str()
                .unwrap()
                .contains("[truncated")
        );
        assert!(pages[1].get("truncated").is_none());
        assert_eq!(pages[1]["markdown"], json!("short"));
    }

    // Truncation counts characters, not bytes, so multi-byte text is cut cleanly.
    #[test]
    fn b8_truncation_is_char_safe() {
        let value = json!({ "markdown": "é".repeat(100) });
        let out = apply_bounds("crw_scrape", &json!({ "maxLength": 10 }), value);
        assert!(
            out["markdown"]
                .as_str()
                .unwrap()
                .starts_with(&"é".repeat(10))
        );
    }

    // Bounding arguments are stripped per tool; crw_search keeps its `limit`.
    #[test]
    fn b9_strip_mcp_only_args() {
        let scrape = strip_mcp_only_args("crw_scrape", json!({ "url": "u", "maxLength": 100 }));
        assert!(scrape.get("maxLength").is_none());
        assert_eq!(scrape["url"], json!("u"));

        let map = strip_mcp_only_args("crw_map", json!({ "url": "u", "limit": 50 }));
        assert!(map.get("limit").is_none());

        let search = strip_mcp_only_args("crw_search", json!({ "query": "q", "limit": 5 }));
        assert_eq!(search["limit"], json!(5));
    }

    // A tool with no bounding arm in `apply_bounds` (here crw_crawl) passes through unchanged.
    #[test]
    fn b10_unknown_tool_passthrough() {
        let value = json!({ "anything": [1, 2, 3] });
        let out = apply_bounds("crw_crawl", &json!({}), value.clone());
        assert_eq!(out, value);
    }

    // Scrape results wrapped in a `{ success, data }` envelope are bounded inside `data`.
    #[test]
    fn b11_scrape_proxy_envelope_is_bounded() {
        let value = json!({
            "success": true,
            "data": { "markdown": long_md(DEFAULT_MAX_LENGTH + 500), "url": "https://e.com" }
        });
        let out = apply_bounds("crw_scrape", &json!({}), value);
        let md = out["data"]["markdown"].as_str().unwrap();
        assert!(
            md.contains("[truncated"),
            "proxy-enveloped scrape must be bounded"
        );
        assert_eq!(out["data"]["truncated"], json!(true));
    }

    // Map results wrapped in a `{ success, data }` envelope are bounded inside `data`.
    #[test]
    fn b12_map_proxy_envelope_is_bounded() {
        let links: Vec<Value> = (0..250)
            .map(|i| json!(format!("https://e.com/{i}")))
            .collect();
        let value = json!({ "success": true, "data": { "links": links } });
        let out = apply_bounds("crw_map", &json!({}), value);
        assert_eq!(
            out["data"]["links"].as_array().unwrap().len(),
            DEFAULT_MAP_LIMIT
        );
        assert_eq!(out["data"]["totalDiscovered"], json!(250));
        assert_eq!(out["data"]["truncated"], json!(true));
    }

    // Every tool has a title and annotations; spot-checks the per-tool hint values.
    #[test]
    fn a1_tools_advertise_annotations_and_title() {
        let defs = tool_definitions(false);
        for t in defs["tools"].as_array().unwrap() {
            assert!(t["annotations"].is_object(), "{} annotations", t["name"]);
            assert!(t["title"].is_string(), "{} title", t["name"]);
            assert_eq!(
                t["annotations"]["destructiveHint"],
                json!(false),
                "{}",
                t["name"]
            );
        }
        let crawl = tool_by_name(&defs, "crw_crawl");
        assert_eq!(crawl["annotations"]["readOnlyHint"], json!(false));
        assert_eq!(crawl["annotations"]["idempotentHint"], json!(false));
        let scrape = tool_by_name(&defs, "crw_scrape");
        assert_eq!(scrape["annotations"]["readOnlyHint"], json!(true));
        assert_eq!(scrape["annotations"]["openWorldHint"], json!(true));
        let parse = tool_by_name(&defs, "crw_parse_file");
        assert_eq!(parse["annotations"]["openWorldHint"], json!(false));
    }

    // All six catalogue names are known; unknown and empty names are not.
    #[test]
    fn a2_is_known_tool() {
        for name in [
            "crw_scrape",
            "crw_crawl",
            "crw_check_crawl_status",
            "crw_map",
            "crw_search",
            "crw_parse_file",
        ] {
            assert!(is_known_tool(name), "{name} should be known");
        }
        assert!(!is_known_tool("nonexistent"));
        assert!(!is_known_tool(""));
    }

    // tools/list includes crw_search only when search is available.
    #[test]
    fn a3_tools_list_conditional_search() {
        // Runs a tools/list request and returns the listed tool names.
        fn list(search_available: bool) -> Vec<String> {
            let req = JsonRpcRequest {
                jsonrpc: "2.0".into(),
                id: Some(json!(1)),
                method: "tools/list".into(),
                params: json!({}),
            };
            let ProtocolResult::Response(resp) =
                handle_protocol_method("crw", "0", &req, false, search_available)
            else {
                panic!("expected response");
            };
            resp.result.unwrap()["tools"]
                .as_array()
                .unwrap()
                .iter()
                .map(|t| t["name"].as_str().unwrap().to_string())
                .collect()
        }
        let with = list(true);
        assert!(with.contains(&"crw_search".to_string()));
        assert_eq!(with.len(), 6);
        let without = list(false);
        assert!(!without.contains(&"crw_search".to_string()));
        assert_eq!(without.len(), 5);
    }

    // Scraped content inlined into search results is bounded in both result shapes.
    #[test]
    fn b13_search_inlined_content_is_bounded() {
        // Flat array of results.
        let flat = json!({
            "success": true,
            "data": { "results": [
                { "url": "https://e.com/1", "markdown": long_md(DEFAULT_MAX_LENGTH + 100) },
                { "url": "https://e.com/2", "description": "no scrape content" }
            ]}
        });
        let out = apply_bounds("crw_search", &json!({}), flat);
        assert!(
            out["data"]["results"][0]["markdown"]
                .as_str()
                .unwrap()
                .contains("[truncated")
        );
        assert_eq!(out["data"]["results"][0]["truncated"], json!(true));
        assert!(out["data"]["results"][1].get("truncated").is_none());

        // Results grouped by source.
        let grouped = json!({
            "success": true,
            "data": { "results": {
                "web": [{ "url": "https://e.com/w", "html": long_md(DEFAULT_MAX_LENGTH + 100) }],
                "news": [{ "url": "https://e.com/n", "description": "short" }]
            }}
        });
        let out = apply_bounds("crw_search", &json!({}), grouped);
        assert_eq!(out["data"]["results"]["web"][0]["truncated"], json!(true));
        assert!(out["data"]["results"]["news"][0].get("truncated").is_none());
    }
}