1use anyhow::{Context, Result};
2use serde_json::Value;
3
4use super::Tool;
5
/// Upper bound, in bytes, on the text returned by a single fetch; longer
/// content is cut off and marked as truncated.
const MAX_CONTENT: usize = 50_000;

/// Tool that fetches a URL and returns its content as text.
///
/// The type carries no state, so it is cheap to copy and can be created with
/// `WebFetchTool` or `WebFetchTool::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct WebFetchTool;
9
10impl Tool for WebFetchTool {
11 fn name(&self) -> &str {
12 "webfetch"
13 }
14
15 fn description(&self) -> &str {
16 "Fetch content from a URL and return it as text. Automatically strips HTML tags for web pages."
17 }
18
19 fn input_schema(&self) -> Value {
20 serde_json::json!({
21 "type": "object",
22 "properties": {
23 "url": {
24 "type": "string",
25 "description": "The URL to fetch content from"
26 }
27 },
28 "required": ["url"]
29 })
30 }
31
32 fn execute(&self, input: Value) -> Result<String> {
33 let url = input["url"]
34 .as_str()
35 .context("Missing required parameter 'url'")?;
36 tracing::debug!("webfetch: {}", url);
37
38 let response =
39 reqwest::blocking::get(url).with_context(|| format!("failed to fetch: {}", url))?;
40
41 let status = response.status();
42 if !status.is_success() {
43 anyhow::bail!("HTTP {}: {}", status.as_u16(), url);
44 }
45
46 let content_type = response
47 .headers()
48 .get("content-type")
49 .and_then(|v| v.to_str().ok())
50 .unwrap_or("")
51 .to_string();
52
53 let body = response
54 .text()
55 .with_context(|| format!("failed to read response from: {}", url))?;
56
57 let text = if content_type.contains("text/html") {
58 strip_html(&body)
59 } else {
60 body
61 };
62
63 if text.len() > MAX_CONTENT {
64 Ok(format!(
65 "{}\n... (truncated at {} chars)",
66 &text[..MAX_CONTENT],
67 MAX_CONTENT
68 ))
69 } else {
70 Ok(text)
71 }
72 }
73}
74
/// Converts an HTML document into plain text.
///
/// * Tags and comments are removed; the content of `script` and `style`
///   elements is dropped entirely.
/// * A line break is inserted in front of opening tags whose name starts with
///   `br`, `p`, `div`, `h`, `li` or `tr`.
/// * The character references `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp` as
///   well as numeric references are decoded; any other well-formed reference
///   becomes a single space.
/// * Runs of whitespace collapse into one space, and the result is trimmed.
///
/// The input is walked as a `&str` and only ever sliced at ASCII delimiters or
/// whole characters, so text containing multi-byte characters is handled
/// without panicking.
fn strip_html(html: &str) -> String {
    let mut out = String::with_capacity(html.len() / 3);
    let mut rest = html;

    while let Some(ch) = rest.chars().next() {
        match ch {
            '<' if starts_markup(rest) => {
                rest = skip_markup(rest, &mut out);
            }
            '&' => match decode_entity(rest) {
                Some((decoded, consumed)) => {
                    push_text_char(&mut out, decoded);
                    rest = &rest[consumed..];
                }
                // A bare ampersand ("AT&T") is ordinary text.
                None => {
                    out.push('&');
                    rest = &rest[1..];
                }
            },
            _ => {
                push_text_char(&mut out, ch);
                rest = &rest[ch.len_utf8()..];
            }
        }
    }

    out.trim().to_string()
}

/// Reports whether `text`, which begins with `<`, opens a tag, comment or
/// declaration rather than being a literal less-than sign (as in `1 < 2`).
fn starts_markup(text: &str) -> bool {
    match text.as_bytes().get(1) {
        Some(&b) => b.is_ascii_alphabetic() || b == b'/' || b == b'!' || b == b'?',
        None => false,
    }
}

/// Consumes the markup at the start of `text` (which begins with `<`) and
/// returns the input that follows it, emitting a line break into `out` for
/// block-level opening tags.
///
/// Markup that is never closed swallows the remainder of the input.
fn skip_markup<'a>(text: &'a str, out: &mut String) -> &'a str {
    // Opening-tag name prefixes that start a new line of output.
    const LINE_BREAK_PREFIXES: [&str; 6] = ["br", "p", "div", "h", "li", "tr"];

    // Comments may contain '>', so they end only at "-->". An unterminated
    // comment falls through and is treated like any other tag.
    if let Some(comment) = text.strip_prefix("<!--") {
        if let Some(end) = comment.find("-->") {
            return &comment[end + 3..];
        }
    }

    let tag_end = match text.find('>') {
        Some(pos) => pos,
        None => return "",
    };
    let inner = &text[1..tag_end];
    // Closing tags start with '/', so their name is empty and they match none
    // of the checks below.
    let name_len = inner
        .bytes()
        .take_while(|b| b.is_ascii_alphanumeric())
        .count();
    let name = &inner[..name_len];
    let after = &text[tag_end + 1..];

    let breaks_line = LINE_BREAK_PREFIXES.iter().any(|prefix| {
        name.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    });
    if breaks_line {
        push_line_break(out);
    }

    let raw_text_close = if name.eq_ignore_ascii_case("script") {
        Some("</script")
    } else if name.eq_ignore_ascii_case("style") {
        Some("</style")
    } else {
        None
    };

    match raw_text_close {
        // Script and style bodies are not markup: a '<' or '>' inside them
        // must not be read as a tag, so jump straight to the closing tag.
        Some(close) => {
            let resume = find_ascii_case_insensitive(after, close)
                .and_then(|start| after[start..].find('>').map(|end| start + end + 1));
            match resume {
                Some(pos) => &after[pos..],
                None => "",
            }
        }
        None => after,
    }
}

/// Decodes the character reference at the start of `text` (which begins with
/// `&`), returning the decoded character and the number of bytes consumed.
///
/// Returns `None` when the ampersand does not start a well-formed reference,
/// i.e. it is not followed by a short run of ASCII letters, digits or `#` that
/// ends in `;`.
fn decode_entity(text: &str) -> Option<(char, usize)> {
    // Longest reference name worth scanning for; keeps a stray '&' from
    // pairing with a ';' much further along in the text.
    const MAX_NAME_LEN: usize = 32;

    let body = text.strip_prefix('&')?;
    let name_len = body
        .bytes()
        .take(MAX_NAME_LEN + 1)
        .position(|b| b == b';')?;
    // `name_len` is the offset of an ASCII ';', hence a valid slice boundary.
    let name = &body[..name_len];
    if name.is_empty()
        || !name
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'#')
    {
        return None;
    }

    let decoded = match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => ' ',
        // Numeric references decode to their code point; anything else that is
        // well-formed but unknown is replaced by a space.
        _ => name
            .strip_prefix('#')
            .and_then(parse_char_reference)
            .unwrap_or(' '),
    };

    // '&' + name + ';'
    Some((decoded, name_len + 2))
}

/// Parses the digits of a numeric character reference (`39` or `x27`) into
/// the character they denote, rejecting NUL and invalid code points.
fn parse_char_reference(digits: &str) -> Option<char> {
    let code = match digits.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => u32::from_str_radix(hex, 16).ok()?,
        None => digits.parse::<u32>().ok()?,
    };
    char::from_u32(code).filter(|c| *c != '\0')
}

/// Appends one character of text to `out`, collapsing whitespace: any
/// whitespace character becomes a single space and is dropped at the start of
/// the output or directly after another space or a line break.
fn push_text_char(out: &mut String, ch: char) {
    if !ch.is_whitespace() {
        out.push(ch);
    } else if !(out.is_empty() || out.ends_with(' ') || out.ends_with('\n')) {
        out.push(' ');
    }
}

/// Ends the current output line, removing trailing spaces first and never
/// producing an empty line.
fn push_line_break(out: &mut String) {
    while out.ends_with(' ') {
        out.pop();
    }
    if !out.is_empty() && !out.ends_with('\n') {
        out.push('\n');
    }
}

/// Finds the byte offset of the first ASCII-case-insensitive occurrence of
/// `needle` in `haystack`.
///
/// `needle` must be non-empty and start with an ASCII character, which
/// guarantees that the returned offset lies on a character boundary.
fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    haystack
        .as_bytes()
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle.as_bytes()))
}