dot/tools/
web.rs

1use anyhow::{Context, Result};
2use serde_json::Value;
3
4use super::Tool;
5
6const MAX_CONTENT: usize = 50_000;
7
8pub struct WebFetchTool;
9
10impl Tool for WebFetchTool {
11    fn name(&self) -> &str {
12        "webfetch"
13    }
14
15    fn description(&self) -> &str {
16        "Fetch content from a URL and return it as text. Automatically strips HTML tags for web pages."
17    }
18
19    fn input_schema(&self) -> Value {
20        serde_json::json!({
21            "type": "object",
22            "properties": {
23                "url": {
24                    "type": "string",
25                    "description": "The URL to fetch content from"
26                }
27            },
28            "required": ["url"]
29        })
30    }
31
32    fn execute(&self, input: Value) -> Result<String> {
33        let url = input["url"]
34            .as_str()
35            .context("Missing required parameter 'url'")?;
36        tracing::debug!("webfetch: {}", url);
37
38        let response =
39            reqwest::blocking::get(url).with_context(|| format!("failed to fetch: {}", url))?;
40
41        let status = response.status();
42        if !status.is_success() {
43            anyhow::bail!("HTTP {}: {}", status.as_u16(), url);
44        }
45
46        let content_type = response
47            .headers()
48            .get("content-type")
49            .and_then(|v| v.to_str().ok())
50            .unwrap_or("")
51            .to_string();
52
53        let body = response
54            .text()
55            .with_context(|| format!("failed to read response from: {}", url))?;
56
57        let text = if content_type.contains("text/html") {
58            strip_html(&body)
59        } else {
60            body
61        };
62
63        if text.len() > MAX_CONTENT {
64            Ok(format!(
65                "{}\n... (truncated at {} chars)",
66                &text[..MAX_CONTENT],
67                MAX_CONTENT
68            ))
69        } else {
70            Ok(text)
71        }
72    }
73}
74
/// Longest `&...;` span, in bytes and including both delimiters, that is still
/// treated as a character reference. Anything longer is taken to be a literal
/// ampersand followed by ordinary text.
const MAX_ENTITY_LEN: usize = 40;

/// Converts an HTML document into rough plain text.
///
/// * Tags are removed; a tag whose name starts with `br`, `p`, `div`, `h`,
///   `li` or `tr` inserts a line break (unless the output already ends in one).
/// * The contents of `<script>` and `<style>` elements are dropped entirely.
/// * `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and numeric references
///   (`&#39;`, `&#x41;`) are decoded; any other well-formed reference becomes
///   a single space. An `&` that does not start a well-formed reference is
///   kept as-is.
/// * Runs of whitespace collapse to one space, and the result is trimmed.
///
/// All positions are byte offsets that always sit on character boundaries, so
/// non-ASCII input is handled without panicking.
fn strip_html(html: &str) -> String {
    const BLOCK_TAG_PREFIXES: [&str; 6] = ["<br", "<p", "<div", "<h", "<li", "<tr"];

    let mut result = String::with_capacity(html.len() / 3);
    let mut last_was_space = false;
    let mut pos = 0;

    while let Some(ch) = html[pos..].chars().next() {
        let rest = &html[pos..];

        if ch == '<' {
            // Script and style bodies are raw text: remember which closing tag
            // ends them so the body can be skipped in one step below.
            let raw_close = if has_prefix_ignore_ascii_case(rest, "<script") {
                Some("</script")
            } else if has_prefix_ignore_ascii_case(rest, "<style") {
                Some("</style")
            } else {
                None
            };

            let is_block = BLOCK_TAG_PREFIXES
                .iter()
                .any(|prefix| has_prefix_ignore_ascii_case(rest, prefix));
            if is_block && !result.ends_with('\n') {
                result.push('\n');
            }

            // Drop the tag itself; an unterminated tag swallows the remainder.
            let after_tag = match rest.find('>') {
                Some(gt) => pos + gt + 1,
                None => html.len(),
            };
            pos = match raw_close {
                // Jump straight to the closing tag so that a stray `<` inside
                // the raw text (e.g. `a < b` in JavaScript) cannot be mistaken
                // for a tag and hide the real terminator.
                Some(close) => match find_ignore_ascii_case(&html[after_tag..], close) {
                    Some(offset) => after_tag + offset,
                    None => html.len(),
                },
                None => after_tag,
            };
            continue;
        }

        if ch == '&' {
            if let Some((decoded, consumed)) = decode_entity(rest) {
                result.push(decoded);
                last_was_space = decoded == ' ';
                pos += consumed;
                continue;
            }
            // Not a reference: fall through and emit the ampersand literally.
        }

        if ch.is_whitespace() {
            if !last_was_space && !result.is_empty() {
                result.push(' ');
                last_was_space = true;
            }
        } else {
            result.push(ch);
            last_was_space = false;
        }
        pos += ch.len_utf8();
    }

    result.trim().to_string()
}

/// Reports whether `text` begins with `prefix`, ignoring ASCII case.
fn has_prefix_ignore_ascii_case(text: &str, prefix: &str) -> bool {
    text.as_bytes()
        .get(..prefix.len())
        .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
}

/// Finds the byte offset of the first ASCII-case-insensitive occurrence of
/// `needle` in `haystack`. `needle` must be non-empty.
fn find_ignore_ascii_case(haystack: &str, needle: &str) -> Option<usize> {
    haystack
        .as_bytes()
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle.as_bytes()))
}

/// Decodes the character reference at the start of `text` (which should begin
/// with `&`).
///
/// Returns the replacement character and the number of bytes consumed, or
/// `None` when `text` does not start with a well-formed `&name;` / `&#digits;`
/// reference within `MAX_ENTITY_LEN` bytes.
fn decode_entity(text: &str) -> Option<(char, usize)> {
    let semi = text
        .bytes()
        .take(MAX_ENTITY_LEN)
        .position(|byte| byte == b';')?;
    let name = text.get(1..semi)?;

    let (is_numeric, body) = match name.strip_prefix('#') {
        Some(digits) => (true, digits),
        None => (false, name),
    };
    if body.is_empty() || !body.bytes().all(|byte| byte.is_ascii_alphanumeric()) {
        return None;
    }

    let decoded = if is_numeric {
        let hex = body.strip_prefix('x').or_else(|| body.strip_prefix('X'));
        let code = match hex {
            Some(digits) => u32::from_str_radix(digits, 16),
            None => body.parse::<u32>(),
        };
        match code.ok().and_then(char::from_u32) {
            // Whitespace and control characters are flattened to a space so
            // they cannot disturb the whitespace handling of the caller.
            Some(c) if !c.is_whitespace() && !c.is_control() => c,
            _ => ' ',
        }
    } else {
        match body {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            // `nbsp` and every other named reference become a plain space.
            _ => ' ',
        }
    };

    Some((decoded, semi + 1))
}
dot/tools/web.rs

dot/tools/
web.rs