Skip to main content

dot/tools/
web.rs

1use anyhow::{Context, Result};
2use serde_json::Value;
3
4use super::Tool;
5
/// Upper bound on the size of the text returned by the `webfetch` tool.
///
/// The limit is compared against `String::len()`, so it is measured in bytes
/// of UTF-8 rather than in characters.
const MAX_CONTENT: usize = 50_000;
7
/// Tool that downloads the content behind a URL and returns it as text.
///
/// The type carries no state; all behaviour lives in its `Tool` implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct WebFetchTool;
9
10impl Tool for WebFetchTool {
11    fn name(&self) -> &str {
12        "webfetch"
13    }
14
15    fn description(&self) -> &str {
16        "Fetch content from a URL and return it as text. Automatically strips HTML tags for web pages."
17    }
18
19    fn input_schema(&self) -> Value {
20        serde_json::json!({
21            "type": "object",
22            "properties": {
23                "url": {
24                    "type": "string",
25                    "description": "The URL to fetch content from"
26                }
27            },
28            "required": ["url"]
29        })
30    }
31
32    fn execute(&self, input: Value) -> Result<String> {
33        let url = input["url"]
34            .as_str()
35            .context("Missing required parameter 'url'")?;
36        tracing::debug!("webfetch: {}", url);
37
38        let response =
39            reqwest::blocking::get(url).with_context(|| format!("failed to fetch: {}", url))?;
40
41        let status = response.status();
42        if !status.is_success() {
43            anyhow::bail!("HTTP {}: {}", status.as_u16(), url);
44        }
45
46        let content_type = response
47            .headers()
48            .get("content-type")
49            .and_then(|v| v.to_str().ok())
50            .unwrap_or("")
51            .to_string();
52
53        let body = response
54            .text()
55            .with_context(|| format!("failed to read response from: {}", url))?;
56
57        let text = if content_type.contains("text/html") {
58            strip_html(&body)
59        } else {
60            body
61        };
62
63        if text.len() > MAX_CONTENT {
64            Ok(format!(
65                "{}\n... (truncated at {} chars)",
66                &text[..MAX_CONTENT],
67                MAX_CONTENT
68            ))
69        } else {
70            Ok(text)
71        }
72    }
73}
74
/// Converts an HTML document into plain text.
///
/// * Tags are removed; the contents of `<script>` and `<style>` elements are
///   dropped entirely.
/// * A line break is inserted before tags starting with `<br`, `<p`, `<div`,
///   `<h`, `<li` or `<tr` (matched case-insensitively, by prefix).
/// * The named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`
///   and numeric entities (`&#39;`, `&#x27;`) are decoded; other well-formed
///   named entities become a single space. An `&` that does not start an
///   entity is kept literally.
/// * Runs of whitespace collapse into one space, runs of more than two line
///   breaks are shortened to two, and the result is trimmed.
///
/// The scan works on byte offsets that always sit on character boundaries, so
/// input containing non-ASCII text is handled without panicking.
fn strip_html(html: &str) -> String {
    /// Tag prefixes that start a new line in the output.
    const BLOCK_TAG_PREFIXES: [&str; 6] = ["<br", "<p", "<div", "<h", "<li", "<tr"];
    /// Longest `&...;` sequence, in bytes, considered as an entity candidate.
    /// Bounding the search keeps a stray `&` from swallowing text up to some
    /// unrelated `;` much later in the document.
    const MAX_ENTITY_LEN: usize = 32;

    let mut result = String::with_capacity(html.len() / 3);
    let mut in_tag = false;
    // While inside <script>/<style>, holds the closing-tag prefix that ends it.
    let mut raw_text_closer: Option<&'static str> = None;
    let mut last_was_space = false;

    // Byte offset into `html`; only ever advanced by whole characters.
    let mut pos = 0;
    while let Some(c) = html[pos..].chars().next() {
        let rest = &html[pos..];
        let width = c.len_utf8();

        if in_tag {
            if c == '>' {
                in_tag = false;
            }
            pos += width;
            continue;
        }

        if let Some(closer) = raw_text_closer {
            // Script and style bodies may contain '<' (e.g. `a < b`), so only
            // the matching closing tag is allowed to end the element.
            if starts_with_ignore_ascii_case(rest, closer) {
                raw_text_closer = None;
                in_tag = true;
            }
            pos += width;
            continue;
        }

        if c == '<' {
            if starts_with_ignore_ascii_case(rest, "<script") {
                raw_text_closer = Some("</script");
            } else if starts_with_ignore_ascii_case(rest, "<style") {
                raw_text_closer = Some("</style");
            }

            let starts_block = BLOCK_TAG_PREFIXES
                .iter()
                .any(|prefix| starts_with_ignore_ascii_case(rest, prefix));
            if starts_block && !result.ends_with('\n') {
                result.push('\n');
            }

            in_tag = true;
            pos += width;
            continue;
        }

        if c == '&' {
            // ';' is ASCII, so the byte position found here is a valid slice end.
            let decoded = rest
                .bytes()
                .take(MAX_ENTITY_LEN)
                .position(|b| b == b';')
                .map(|end| &rest[..=end])
                .and_then(|entity| decode_entity(entity).map(|ch| (entity.len(), ch)));
            if let Some((consumed, ch)) = decoded {
                result.push(ch);
                last_was_space = ch == ' ';
                pos += consumed;
                continue;
            }
            // Not an entity: fall through and emit the '&' as ordinary text.
        }

        if c.is_whitespace() {
            if !last_was_space && !result.is_empty() {
                result.push(' ');
                last_was_space = true;
            }
        } else {
            result.push(c);
            last_was_space = false;
        }

        pos += width;
    }

    // Allow at most two consecutive line breaks.
    let mut cleaned = String::with_capacity(result.len());
    let mut consecutive = 0;
    for c in result.chars() {
        if c == '\n' {
            consecutive += 1;
            if consecutive <= 2 {
                cleaned.push(c);
            }
        } else {
            consecutive = 0;
            cleaned.push(c);
        }
    }

    cleaned.trim().to_string()
}

/// Reports whether `text` begins with `prefix`, ignoring ASCII case.
///
/// The comparison is done on bytes, so it never slices `text` inside a
/// multi-byte character.
fn starts_with_ignore_ascii_case(text: &str, prefix: &str) -> bool {
    text.len() >= prefix.len()
        && text.as_bytes()[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
}

/// Decodes one complete entity of the form `&name;`, `&#NN;` or `&#xHH;`.
///
/// Returns `None` when `entity` is not shaped like an entity, in which case
/// the caller keeps the `&` as literal text. Well-formed but unrecognised
/// names, invalid code points, and whitespace or control characters all
/// decode to a plain space.
fn decode_entity(entity: &str) -> Option<char> {
    let name = entity.strip_prefix('&')?.strip_suffix(';')?;

    match name {
        "amp" => return Some('&'),
        "lt" => return Some('<'),
        "gt" => return Some('>'),
        "quot" => return Some('"'),
        "apos" => return Some('\''),
        "nbsp" => return Some(' '),
        _ => {}
    }

    if let Some(body) = name.strip_prefix('#') {
        let (digits, radix) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (body, 10),
        };
        if digits.is_empty() || !digits.chars().all(|d| d.is_digit(radix)) {
            return None;
        }
        let decoded = u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32);
        return Some(match decoded {
            Some(ch) if !ch.is_whitespace() && !ch.is_control() => ch,
            _ => ' ',
        });
    }

    if !name.is_empty() && name.bytes().all(|b| b.is_ascii_alphanumeric()) {
        // Unknown named entity: replaced by a space.
        Some(' ')
    } else {
        None
    }
}