Skip to main content

atd_tools_web/
fetch.rs

1//! `ref:web.fetch` — HTTP GET with SSRF guard, size/time caps, and content-type-aware body shaping.
2
3use std::collections::HashSet;
4use std::net::{IpAddr, ToSocketAddrs};
5use std::sync::OnceLock;
6use std::time::{Duration, Instant};
7
8use atd_protocol::{
9    BindingProtocol, SafetyLevel, ToolBinding, ToolCapability, ToolDefinition, ToolResources,
10    ToolSafety, ToolTrust, ToolVisibility, TrustLevel,
11};
12use reqwest::header::{HeaderMap, HeaderName, HeaderValue};
13use url::Url;
14
15use atd_runtime::context::CallContext;
16use atd_runtime::error::ToolCallError;
17use atd_runtime::registry::{CallFuture, Tool};
18
19static DEFINITION: OnceLock<ToolDefinition> = OnceLock::new();
20
/// Response-body cap (in bytes) used when the caller omits `max_bytes`.
const DEFAULT_MAX_BYTES: usize = 10 * 1_000 * 1_000;
/// Request timeout (in milliseconds) used when the caller omits `timeout_ms`.
const DEFAULT_TIMEOUT_MS: u64 = 30 * 1_000;
/// Upper bound (in milliseconds) that a caller-supplied `timeout_ms` is clamped to.
const MAX_TIMEOUT_MS: u64 = 120 * 1_000;
/// Threshold compared against the redirect history length in the redirect policy.
const MAX_REDIRECTS: usize = 5;
/// Longest accepted `url` argument, measured in bytes.
const MAX_URL_BYTES: usize = 2_048;
/// `User-Agent` value installed on the HTTP client by default.
const DEFAULT_UA: &str = "atd-ref-server/0.1 (+https://atd-protocol.org)";
27
/// Returns the set of request-header names (lower-case) that callers may set.
///
/// The set is built once on first use and shared for the life of the process.
fn allowed_headers() -> &'static HashSet<&'static str> {
    static ALLOWLIST: OnceLock<HashSet<&'static str>> = OnceLock::new();
    ALLOWLIST.get_or_init(|| {
        HashSet::from(["accept", "accept-language", "referer", "user-agent"])
    })
}
39
40fn definition() -> &'static ToolDefinition {
41    DEFINITION.get_or_init(|| ToolDefinition {
42        id: "ref:web.fetch".into(),
43        name: "Web Fetch".into(),
44        description: "HTTP GET a URL and return the body. HTML is converted to markdown; JSON/plain-text are returned verbatim; binary responses return metadata only. Enforces SSRF guard (blocks private/loopback IPs by default), size cap (default 10 MiB), timeout (default 30s, max 120s), and a 5-redirect cap. Request headers are restricted to an allowlist (accept, accept-language, referer, user-agent).".into(),
45        version: "0.1.0".into(),
46        capability: ToolCapability {
47            domain: "web".into(),
48            actions: vec!["fetch".into()],
49            tags: vec!["web".into(), "http".into(), "fetch".into()],
50            intent_examples: vec![
51                "fetch https://example.com".into(),
52                "read the README at https://example.com/repo/readme.md".into(),
53            ],
54        },
55        input_schema: serde_json::json!({
56            "type": "object",
57            "properties": {
58                "url":           { "type": "string",  "minLength": 1, "maxLength": 2048 },
59                "headers":       { "type": "object",  "additionalProperties": { "type": "string" } },
60                "max_bytes":     { "type": "integer", "minimum": 1 },
61                "timeout_ms":    { "type": "integer", "minimum": 1 },
62                "allow_private": { "type": "boolean" }
63            },
64            "required": ["url"]
65        }),
66        output_schema: serde_json::json!({
67            "type": "object",
68            "properties": {
69                "url":             { "type": "string" },
70                "status":          { "type": "integer" },
71                "content_type":    { "type": "string" },
72                "content":         { "type": "string" },
73                "content_length":  { "type": "integer" },
74                "truncated":       { "type": "boolean" },
75                "binary":          { "type": "boolean" },
76                "redirected_from": { "type": "array", "items": { "type": "string" } },
77                "duration_ms":     { "type": "integer" }
78            }
79        }),
80        bindings: vec![ToolBinding {
81            protocol: BindingProtocol::Cli,
82            config: serde_json::json!({}),
83        }],
84        safety: ToolSafety {
85            level: SafetyLevel::Read,
86            dry_run: false,
87            side_effects: vec!["network:outbound".into()],
88            data_sensitivity: Some(
89                "URL fingerprint + source IP visible to the target server".into(),
90            ),
91        },
92        resources: ToolResources {
93            timeout_ms: MAX_TIMEOUT_MS,
94            max_concurrent: 10,
95            rate_limit_per_min: None,
96            estimated_tokens: Some(800),
97        },
98        trust: ToolTrust {
99            publisher: "atd-ref-server".into(),
100            trust_level: TrustLevel::L2Tested,
101            signature: None,
102        },
103        visibility: ToolVisibility::Read,
104        required_capabilities: vec![],
105        tier: None,
106        errors: vec![],
107    })
108}
109
/// Handle for the `ref:web.fetch` tool.
///
/// The type is a field-less unit struct, so every value is interchangeable;
/// it is `Copy` and can be created with [`WebFetchTool::new`] or `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct WebFetchTool;

impl WebFetchTool {
    /// Creates a tool handle. Equivalent to `WebFetchTool::default()`.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
123
124#[derive(serde::Deserialize)]
125struct FetchArgs {
126    url: String,
127    #[serde(default)]
128    headers: Option<serde_json::Map<String, serde_json::Value>>,
129    #[serde(default)]
130    max_bytes: Option<usize>,
131    #[serde(default)]
132    timeout_ms: Option<u64>,
133    #[serde(default)]
134    allow_private: Option<bool>,
135}
136
/// Reports whether `ip` must be treated as non-public by the SSRF guard.
///
/// Returns `true` for loopback, link-local, private, unspecified, multicast,
/// broadcast, and the special-purpose ranges listed inline. For IPv6 it also
/// unwraps addresses that merely *embed* an IPv4 address (IPv4-mapped,
/// IPv4-compatible, NAT64 well-known prefix, 6to4) and classifies the embedded
/// address, so that e.g. `64:ff9b::7f00:1` cannot be used to reach `127.0.0.1`
/// through a translator. Embedded *public* addresses stay allowed.
fn ip_is_private(ip: &IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => {
            let [a, b, c, _] = v4.octets();
            v4.is_loopback()
                || v4.is_link_local()
                || v4.is_private()
                || v4.is_broadcast()
                || v4.is_unspecified()
                || v4.is_multicast()
                // 0.0.0.0/8 — current network (catches 0.x.y.z, not just 0.0.0.0)
                || a == 0
                // 100.64.0.0/10 — Carrier-Grade NAT
                || (a == 100 && (b & 0xC0) == 64)
                // 192.0.0.0/24 — IETF protocol assignments
                || (a == 192 && b == 0 && c == 0)
                // 192.0.2.0/24 — TEST-NET-1
                || (a == 192 && b == 0 && c == 2)
                // 198.18.0.0/15 — Benchmarking
                || (a == 198 && (b & 0xFE) == 18)
                // 198.51.100.0/24 — TEST-NET-2
                || (a == 198 && b == 51 && c == 100)
                // 203.0.113.0/24 — TEST-NET-3
                || (a == 203 && b == 0 && c == 113)
                // 240.0.0.0/4 — reserved for future use (includes broadcast)
                || a >= 240
        }
        IpAddr::V6(v6) => {
            let seg = v6.segments();
            // Classify an IPv4 address carried in two consecutive 16-bit groups.
            let embedded_is_private = |hi: u16, lo: u16| {
                let [o0, o1] = hi.to_be_bytes();
                let [o2, o3] = lo.to_be_bytes();
                ip_is_private(&IpAddr::V4(std::net::Ipv4Addr::new(o0, o1, o2, o3)))
            };
            let first_five_zero = seg[..5].iter().all(|&s| s == 0);
            v6.is_loopback()
                || v6.is_unspecified()
                || v6.is_multicast()
                // Link-local fe80::/10
                || (seg[0] & 0xffc0) == 0xfe80
                // ULA fc00::/7
                || (seg[0] & 0xfe00) == 0xfc00
                // Documentation 2001:db8::/32 (IPv6 counterpart of the TEST-NETs)
                || (seg[0] == 0x2001 && seg[1] == 0x0db8)
                // IPv4-mapped ::ffff:a.b.c.d and deprecated IPv4-compatible ::a.b.c.d
                || (first_five_zero
                    && (seg[5] == 0xffff || seg[5] == 0)
                    && embedded_is_private(seg[6], seg[7]))
                // NAT64 well-known prefix 64:ff9b::/96
                || (seg[0] == 0x0064
                    && seg[1] == 0xff9b
                    && seg[2..6].iter().all(|&s| s == 0)
                    && embedded_is_private(seg[6], seg[7]))
                // 6to4 2002:AABB:CCDD::/48 embeds a.b.c.d right after the prefix
                || (seg[0] == 0x2002 && embedded_is_private(seg[1], seg[2]))
        }
    }
}
178
179fn check_ssrf(url: &Url, allow_private: bool) -> Result<(), ToolCallError> {
180    if allow_private {
181        return Ok(());
182    }
183    let host = url
184        .host_str()
185        .ok_or_else(|| ToolCallError::InvalidArgs("URL has no host".into()))?;
186    // If the host is a literal IP, parse directly.
187    if let Ok(ip) = host.parse::<IpAddr>() {
188        if ip_is_private(&ip) {
189            return Err(ToolCallError::ExecutionFailed {
190                code: "PRIVATE_ADDRESS_BLOCKED".into(),
191                message: format!("{ip} is a private/loopback/link-local address"),
192                retryable: false,
193            });
194        }
195        return Ok(());
196    }
197    // DNS resolve. Port doesn't matter for IP classification; use a dummy.
198    let port = url.port_or_known_default().unwrap_or(80);
199    // SECURITY NOTE: DNS rebinding is not prevented here. We resolve once and
200    // check each IP, but an adversary controlling DNS could return a public IP
201    // for this lookup and then switch the record to a private IP for reqwest's
202    // own connect-time resolution. Accepted trade-off for MVP. Phase 2 fix
203    // requires binding reqwest to a custom resolver that shares this result
204    // OR adding a connect-time firewall layer.
205    let mut addrs = match (host, port).to_socket_addrs() {
206        Ok(it) => it.peekable(),
207        Err(e) => {
208            return Err(ToolCallError::ExecutionFailed {
209                code: "DNS_FAILED".into(),
210                message: format!("dns lookup failed for {host}: {e}"),
211                retryable: true,
212            });
213        }
214    };
215    if addrs.peek().is_none() {
216        return Err(ToolCallError::ExecutionFailed {
217            code: "DNS_FAILED".into(),
218            message: format!("no addresses resolved for {host}"),
219            retryable: true,
220        });
221    }
222    for sa in addrs {
223        let ip = sa.ip();
224        if ip_is_private(&ip) {
225            return Err(ToolCallError::ExecutionFailed {
226                code: "PRIVATE_ADDRESS_BLOCKED".into(),
227                message: format!("{host} resolves to private address {ip}"),
228                retryable: false,
229            });
230        }
231    }
232    Ok(())
233}
234
235fn build_headers(
236    input: Option<&serde_json::Map<String, serde_json::Value>>,
237) -> Result<HeaderMap, ToolCallError> {
238    let mut hm = HeaderMap::new();
239    let Some(map) = input else {
240        return Ok(hm);
241    };
242    let allowed = allowed_headers();
243    for (k, v) in map.iter() {
244        let lower = k.to_lowercase();
245        if !allowed.contains(lower.as_str()) {
246            return Err(ToolCallError::InvalidArgs(format!(
247                "header `{k}` is not in the allowlist (allowed: accept, accept-language, referer, user-agent)"
248            )));
249        }
250        let name = HeaderName::from_bytes(lower.as_bytes())
251            .map_err(|e| ToolCallError::InvalidArgs(format!("bad header name `{k}`: {e}")))?;
252        let Some(s) = v.as_str() else {
253            return Err(ToolCallError::InvalidArgs(format!(
254                "header `{k}` must be a string"
255            )));
256        };
257        let val = HeaderValue::from_str(s)
258            .map_err(|e| ToolCallError::InvalidArgs(format!("bad header value for `{k}`: {e}")))?;
259        hm.insert(name, val);
260    }
261    Ok(hm)
262}
263
/// Maps a `Content-Type` header value to the way the body should be shaped.
///
/// Parameters after `;` (such as `charset`) and ASCII case are ignored.
/// * `text/html`, `application/xhtml+xml` → [`ContentKind::Html`]
/// * `application/json`, `application/xml`, `application/javascript`, any
///   `text/*`, and any type with a `+json` or `+xml` structured-syntax suffix
///   (e.g. `application/problem+json`, `image/svg+xml`) → [`ContentKind::Text`]
/// * everything else, including an empty value → [`ContentKind::Binary`]
fn classify_content_type(ct: &str) -> ContentKind {
    let base = ct
        .split(';')
        .next()
        .unwrap_or("")
        .trim()
        .to_ascii_lowercase();
    match base.as_str() {
        // Checked first so the XHTML type is not swallowed by the `+xml` rule.
        "text/html" | "application/xhtml+xml" => ContentKind::Html,
        "application/json" | "application/xml" | "application/javascript" => ContentKind::Text,
        other
            if other.starts_with("text/")
                || other.ends_with("+json")
                || other.ends_with("+xml") =>
        {
            ContentKind::Text
        }
        _ => ContentKind::Binary,
    }
}

/// How a response body is turned into the tool's `content` field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ContentKind {
    /// HTML markup; converted to markdown before being returned.
    Html,
    /// Textual payload; returned as (lossily decoded) UTF-8.
    Text,
    /// Anything else; only metadata is returned.
    Binary,
}
285
286/// Convert HTML to markdown, stripping script and style tag contents so
287/// agents aren't fed JavaScript source or CSS as body text.
288fn html_to_markdown(html: &str) -> String {
289    use htmd::HtmlToMarkdown;
290    let converter = HtmlToMarkdown::builder()
291        .skip_tags(vec!["script", "style"])
292        .build();
293    converter.convert(html).unwrap_or_default()
294}
295
296/// Stream-read bytes up to `cap`. Returns `(bytes, truncated)`.
297async fn read_body_capped(
298    mut response: reqwest::Response,
299    cap: usize,
300) -> Result<(Vec<u8>, bool), reqwest::Error> {
301    let mut buf: Vec<u8> = Vec::new();
302    let mut truncated = false;
303    while let Some(chunk) = response.chunk().await? {
304        if buf.len() >= cap {
305            // Cap already reached. Drop the Response (via function return) to
306            // close the connection — reqwest will RST, and we don't need any
307            // further bytes. Drain-past-cap logic (useful for subprocess pipes)
308            // would be a DoS vector here: a slow-drip server could hold our
309            // reqwest connection open for the full timeout window.
310            truncated = true;
311            break;
312        }
313        let room = cap - buf.len();
314        if chunk.len() <= room {
315            buf.extend_from_slice(&chunk);
316        } else {
317            buf.extend_from_slice(&chunk[..room]);
318            truncated = true;
319            break;
320        }
321    }
322    Ok((buf, truncated))
323}
324
325impl Tool for WebFetchTool {
326    fn definition(&self) -> &ToolDefinition {
327        definition()
328    }
329
330    fn call<'a>(&'a self, args: serde_json::Value, ctx: &'a CallContext) -> CallFuture<'a> {
331        Box::pin(async move {
332            let args: FetchArgs = serde_json::from_value(args)
333                .map_err(|e| ToolCallError::InvalidArgs(e.to_string()))?;
334            if args.url.trim().is_empty() {
335                return Err(ToolCallError::InvalidArgs(
336                    "url is empty or whitespace-only".into(),
337                ));
338            }
339            if args.url.len() > MAX_URL_BYTES {
340                return Err(ToolCallError::InvalidArgs(format!(
341                    "url exceeds {MAX_URL_BYTES} bytes"
342                )));
343            }
344            let parsed = Url::parse(&args.url)
345                .map_err(|e| ToolCallError::InvalidArgs(format!("invalid URL: {e}")))?;
346            match parsed.scheme() {
347                "http" | "https" => {}
348                other => {
349                    return Err(ToolCallError::InvalidArgs(format!(
350                        "only http/https URLs are supported; got {other}"
351                    )));
352                }
353            }
354            let headers = build_headers(args.headers.as_ref())?;
355            let allow_private = args.allow_private.unwrap_or(false);
356            check_ssrf(&parsed, allow_private)?;
357
358            let max_bytes = args
359                .max_bytes
360                .unwrap_or(DEFAULT_MAX_BYTES)
361                .clamp(1, ctx.max_output_bytes);
362            let timeout_ms = args
363                .timeout_ms
364                .unwrap_or(DEFAULT_TIMEOUT_MS)
365                .clamp(1, MAX_TIMEOUT_MS);
366
367            let redirect_chain: std::sync::Arc<std::sync::Mutex<Vec<String>>> =
368                std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
369            let chain_for_policy = redirect_chain.clone();
370            let allow_private_for_policy = allow_private;
371
372            let redirect_policy = reqwest::redirect::Policy::custom(move |attempt| {
373                // Record the previous URL (where we came from).
374                if let Some(prev) = attempt.previous().last() {
375                    if let Ok(mut chain) = chain_for_policy.lock() {
376                        chain.push(prev.to_string());
377                    }
378                }
379                if attempt.previous().len() >= MAX_REDIRECTS {
380                    return attempt.error("too many redirects");
381                }
382                // Re-run the SSRF check on each redirect destination.
383                if let Err(e) = check_ssrf(attempt.url(), allow_private_for_policy) {
384                    return attempt.error(format!("redirect blocked: {e:?}"));
385                }
386                attempt.follow()
387            });
388
389            let client = reqwest::Client::builder()
390                .redirect(redirect_policy)
391                .timeout(Duration::from_millis(timeout_ms))
392                .user_agent(DEFAULT_UA)
393                .build()
394                .map_err(|e| ToolCallError::ExecutionFailed {
395                    code: "IO".into(),
396                    message: format!("client build failed: {e}"),
397                    retryable: false,
398                })?;
399
400            let start = Instant::now();
401            let resp = client
402                .get(parsed.clone())
403                .headers(headers)
404                .send()
405                .await
406                .map_err(map_reqwest_error)?;
407
408            let final_url = resp.url().to_string();
409            let status = resp.status().as_u16();
410            let content_type = resp
411                .headers()
412                .get(reqwest::header::CONTENT_TYPE)
413                .and_then(|v| v.to_str().ok())
414                .unwrap_or("")
415                .to_string();
416
417            let (body_bytes, truncated) = read_body_capped(resp, max_bytes)
418                .await
419                .map_err(map_reqwest_error)?;
420            let content_length = body_bytes.len();
421            let kind = classify_content_type(&content_type);
422            let (content, binary) = match kind {
423                ContentKind::Html => {
424                    let text = String::from_utf8_lossy(&body_bytes).into_owned();
425                    let md = html_to_markdown(&text);
426                    (md, false)
427                }
428                ContentKind::Text => (String::from_utf8_lossy(&body_bytes).into_owned(), false),
429                ContentKind::Binary => (String::new(), true),
430            };
431            let duration_ms = start.elapsed().as_millis() as u64;
432
433            Ok(serde_json::json!({
434                "url": final_url,
435                "status": status,
436                "content_type": content_type,
437                "content": content,
438                "content_length": content_length,
439                "truncated": truncated,
440                "binary": binary,
441                "redirected_from": redirect_chain.lock()
442                    .map(|v| serde_json::Value::Array(
443                        v.iter().map(|s| serde_json::Value::String(s.clone())).collect()
444                    ))
445                    .unwrap_or_else(|_| serde_json::Value::Array(vec![])),
446                "duration_ms": duration_ms,
447            }))
448        })
449    }
450}
451
452fn map_reqwest_error(e: reqwest::Error) -> ToolCallError {
453    if e.is_timeout() {
454        ToolCallError::ExecutionFailed {
455            code: "TIMEOUT".into(),
456            message: format!("{e}"),
457            retryable: true,
458        }
459    } else if e.is_redirect() {
460        ToolCallError::ExecutionFailed {
461            code: "TOO_MANY_REDIRECTS".into(),
462            message: format!("{e}"),
463            retryable: false,
464        }
465    } else if e.is_connect() {
466        let msg = format!("{e}");
467        // reqwest 0.12 does not expose a stable TLS-specific error variant.
468        // We infer TLS failure from the formatted error string. This is fragile
469        // but is the best available option without vendoring hyper-rustls or
470        // mapping from private reqwest types.
471        let code =
472            if msg.to_lowercase().contains("tls") || msg.to_lowercase().contains("certificate") {
473                "TLS_FAILED"
474            } else {
475                "IO"
476            };
477        ToolCallError::ExecutionFailed {
478            code: code.into(),
479            message: msg,
480            retryable: code == "IO",
481        }
482    } else {
483        ToolCallError::ExecutionFailed {
484            code: "IO".into(),
485            message: format!("{e}"),
486            retryable: true,
487        }
488    }
489}
490
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use tokio::io::{AsyncReadExt, AsyncWriteExt};
    use tokio::net::TcpListener;

    /// Helper: spawn a one-shot HTTP server that returns the given response
    /// bytes verbatim for a single connection. Returns the bound port.
    async fn spawn_oneshot(response: Vec<u8>) -> u16 {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let port = listener.local_addr().unwrap().port();
        tokio::spawn(async move {
            if let Ok((mut sock, _)) = listener.accept().await {
                // Drain the request (don't care about it).
                let mut buf = [0u8; 4096];
                let _ = sock.read(&mut buf).await;
                let _ = sock.write_all(&response).await;
                let _ = sock.shutdown().await;
            }
        });
        port
    }

    /// Helper that returns both the port AND a shared buffer that captures
    /// the raw request. The capture is stored *before* the response is
    /// written, so once the client has received the response the buffer is
    /// guaranteed to be populated and no sleep is needed.
    async fn spawn_capturing(response: Vec<u8>) -> (u16, Arc<tokio::sync::Mutex<Vec<u8>>>) {
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let port = listener.local_addr().unwrap().port();
        let captured = Arc::new(tokio::sync::Mutex::new(Vec::new()));
        let sink = captured.clone();
        tokio::spawn(async move {
            if let Ok((mut sock, _)) = listener.accept().await {
                let mut request: Vec<u8> = Vec::new();
                let mut chunk = [0u8; 4096];
                // Read until we see a blank line (end of headers).
                loop {
                    match sock.read(&mut chunk).await {
                        Ok(0) | Err(_) => break,
                        Ok(n) => {
                            request.extend_from_slice(&chunk[..n]);
                            if request.windows(4).any(|w| w == b"\r\n\r\n") {
                                break;
                            }
                        }
                    }
                }
                *sink.lock().await = request;
                let _ = sock.write_all(&response).await;
                let _ = sock.shutdown().await;
            }
        });
        (port, captured)
    }

    /// Builds a minimal `200 OK` HTTP/1.1 response with the given type and body.
    fn http_ok(ctype: &str, body: &[u8]) -> Vec<u8> {
        let mut v = Vec::new();
        v.extend_from_slice(b"HTTP/1.1 200 OK\r\n");
        v.extend_from_slice(format!("Content-Type: {ctype}\r\n").as_bytes());
        v.extend_from_slice(format!("Content-Length: {}\r\n", body.len()).as_bytes());
        v.extend_from_slice(b"Connection: close\r\n\r\n");
        v.extend_from_slice(body);
        v
    }

    /// Invokes the tool once with a fresh test context.
    async fn fetch(args: serde_json::Value) -> Result<serde_json::Value, ToolCallError> {
        let tool = WebFetchTool::new();
        let ctx = CallContext::for_test();
        tool.call(args, &ctx).await
    }

    /// Asserts that fetching `url` with default arguments is stopped by the
    /// SSRF guard.
    async fn assert_blocked(url: &str) {
        let err = fetch(serde_json::json!({ "url": url })).await.unwrap_err();
        match err {
            ToolCallError::ExecutionFailed { code, .. } => {
                assert_eq!(code, "PRIVATE_ADDRESS_BLOCKED");
            }
            _ => panic!("expected PRIVATE_ADDRESS_BLOCKED, got {err:?}"),
        }
    }

    #[tokio::test]
    async fn rejects_non_http_scheme() {
        let err = fetch(serde_json::json!({"url": "file:///etc/passwd"}))
            .await
            .unwrap_err();
        assert!(matches!(err, ToolCallError::InvalidArgs(_)));
    }

    #[tokio::test]
    async fn rejects_private_ip_by_default() {
        assert_blocked("http://127.0.0.1:9").await;
    }

    #[tokio::test]
    async fn allows_private_with_flag() {
        let body = b"<html><body><h1>Hi</h1></body></html>";
        let port = spawn_oneshot(http_ok("text/html; charset=utf-8", body)).await;
        let r = fetch(serde_json::json!({
            "url": format!("http://127.0.0.1:{port}/"),
            "allow_private": true
        }))
        .await
        .unwrap();
        assert_eq!(r["status"], 200);
        assert_eq!(r["binary"], false);
        let content = r["content"].as_str().unwrap();
        assert!(
            content.contains("Hi"),
            "markdown should contain 'Hi': {content:?}"
        );
    }

    #[tokio::test]
    async fn rejects_disallowed_request_header() {
        let err = fetch(serde_json::json!({
            "url": "http://127.0.0.1:9",
            "headers": {"Authorization": "Bearer xxx"},
            "allow_private": true
        }))
        .await
        .unwrap_err();
        match err {
            ToolCallError::InvalidArgs(msg) => {
                assert!(msg.to_lowercase().contains("allowlist"));
            }
            _ => panic!("expected InvalidArgs, got {err:?}"),
        }
    }

    #[tokio::test]
    async fn accepts_allowed_request_header() {
        let (port, captured) = spawn_capturing(http_ok("text/plain", b"ok")).await;
        let _ = fetch(serde_json::json!({
            "url": format!("http://127.0.0.1:{port}/"),
            "headers": {"Accept": "application/json"},
            "allow_private": true
        }))
        .await
        .unwrap();
        // The server stores the request before replying, so it is available
        // as soon as the call above has returned.
        let raw = captured.lock().await;
        let request_str = String::from_utf8_lossy(&raw);
        assert!(
            request_str
                .to_lowercase()
                .contains("accept: application/json"),
            "request should contain 'accept: application/json': {request_str:?}"
        );
    }

    #[tokio::test]
    async fn truncates_at_max_bytes() {
        let body = vec![b'x'; 10_000];
        let port = spawn_oneshot(http_ok("text/plain", &body)).await;
        let r = fetch(serde_json::json!({
            "url": format!("http://127.0.0.1:{port}/"),
            "max_bytes": 1024,
            "allow_private": true
        }))
        .await
        .unwrap();
        assert_eq!(r["truncated"], true);
        let content = r["content"].as_str().unwrap();
        assert!(content.len() <= 1024);
    }

    #[tokio::test]
    async fn html_converted_to_markdown() {
        let body = b"<html><head><script>evil()</script></head><body><h1>Title</h1></body></html>";
        let port = spawn_oneshot(http_ok("text/html; charset=utf-8", body)).await;
        let r = fetch(serde_json::json!({
            "url": format!("http://127.0.0.1:{port}/"),
            "allow_private": true
        }))
        .await
        .unwrap();
        let content = r["content"].as_str().unwrap();
        assert!(
            content.contains("Title"),
            "content should contain Title: {content:?}"
        );
        assert!(
            !content.to_lowercase().contains("evil()"),
            "script body should be stripped: {content:?}"
        );
    }

    #[tokio::test]
    async fn binary_content_type_emits_empty_content() {
        let body = [0u8, 1, 2, 3, 4, 5];
        let port = spawn_oneshot(http_ok("image/png", &body)).await;
        let r = fetch(serde_json::json!({
            "url": format!("http://127.0.0.1:{port}/"),
            "allow_private": true
        }))
        .await
        .unwrap();
        assert_eq!(r["binary"], true);
        assert_eq!(r["content"], "");
        assert_eq!(r["content_length"], body.len());
    }

    #[tokio::test]
    async fn zero_octet_ip_blocked() {
        assert_blocked("http://0.0.0.0:80").await;
    }

    #[tokio::test]
    async fn test_net_range_blocked() {
        assert_blocked("http://192.0.2.1:80").await;
    }

    #[test]
    fn ipv4_mapped_addresses_follow_embedded_v4() {
        let mapped_loopback: IpAddr = "::ffff:7f00:1".parse().unwrap();
        let mapped_public: IpAddr = "::ffff:808:808".parse().unwrap();
        assert!(ip_is_private(&mapped_loopback));
        assert!(!ip_is_private(&mapped_public));
    }

    #[test]
    fn content_type_parameters_and_case_are_ignored() {
        assert!(matches!(
            classify_content_type("TEXT/HTML; charset=utf-8"),
            ContentKind::Html
        ));
        assert!(matches!(
            classify_content_type("application/json;charset=utf-8"),
            ContentKind::Text
        ));
        assert!(matches!(
            classify_content_type("image/png"),
            ContentKind::Binary
        ));
    }
}