1use anyhow::{Context, Result};
2use serde_json::Value;
3
4use super::Tool;
5
/// Upper bound on the size of the text returned by the tool before it is
/// truncated. It is compared against `String::len`, so the unit is bytes of
/// UTF-8 rather than Unicode scalar values.
const MAX_CONTENT: usize = 50_000;
7
/// Tool that fetches the content of a URL and returns it as text, stripping
/// HTML markup from web pages.
pub struct WebFetchTool;
9
10impl Tool for WebFetchTool {
11 fn name(&self) -> &str {
12 "webfetch"
13 }
14
15 fn description(&self) -> &str {
16 "Fetch content from a URL and return it as text. Automatically strips HTML tags for web pages."
17 }
18
19 fn input_schema(&self) -> Value {
20 serde_json::json!({
21 "type": "object",
22 "properties": {
23 "url": {
24 "type": "string",
25 "description": "The URL to fetch content from"
26 }
27 },
28 "required": ["url"]
29 })
30 }
31
32 fn execute(&self, input: Value) -> Result<String> {
33 let url = input["url"]
34 .as_str()
35 .context("Missing required parameter 'url'")?;
36 tracing::debug!("webfetch: {}", url);
37
38 let response =
39 reqwest::blocking::get(url).with_context(|| format!("failed to fetch: {}", url))?;
40
41 let status = response.status();
42 if !status.is_success() {
43 anyhow::bail!("HTTP {}: {}", status.as_u16(), url);
44 }
45
46 let content_type = response
47 .headers()
48 .get("content-type")
49 .and_then(|v| v.to_str().ok())
50 .unwrap_or("")
51 .to_string();
52
53 let body = response
54 .text()
55 .with_context(|| format!("failed to read response from: {}", url))?;
56
57 let text = if content_type.contains("text/html") {
58 strip_html(&body)
59 } else {
60 body
61 };
62
63 if text.len() > MAX_CONTENT {
64 Ok(format!(
65 "{}\n... (truncated at {} chars)",
66 &text[..MAX_CONTENT],
67 MAX_CONTENT
68 ))
69 } else {
70 Ok(text)
71 }
72 }
73}
74
/// Converts an HTML document into plain text.
///
/// The conversion is a single heuristic pass, not a real HTML parser:
///
/// * Tags are removed. Tags whose name starts with `br`, `p`, `div`, `h`,
///   `li` or `tr` additionally start a new output line.
/// * The contents of `<script>` and `<style>` elements and of `<!-- -->`
///   comments are dropped. An unterminated element, comment or tag drops the
///   rest of the input.
/// * `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and numeric
///   references (`&#39;`, `&#x27;`) are decoded. Any other well-formed
///   reference becomes a single space.
/// * A `&` or `<` that does not start a reference or a tag is kept literally.
/// * Runs of whitespace collapse to one space, and the result is trimmed.
///
/// Tag names are matched ASCII-case-insensitively. All scanning is done on
/// `char` indices, so non-ASCII input is handled without byte-offset slicing.
fn strip_html(html: &str) -> String {
    /// Longest reference body (the text between `&` and `;`) that is accepted.
    /// Bounding it stops a stray `&` from swallowing text up to a distant `;`.
    const MAX_ENTITY_BODY: usize = 32;
    /// Tag prefixes that force a line break in the output.
    const LINE_BREAK_TAGS: [&str; 6] = ["<br", "<p", "<div", "<h", "<li", "<tr"];
    /// Elements whose content is not text, paired with their closing tag prefix.
    const RAW_TEXT_TAGS: [(&str, &str); 2] = [("<script", "</script"), ("<style", "</style")];

    /// Reports whether `chars[at..]` begins with `pat`, ignoring ASCII case.
    fn starts_with_ci(chars: &[char], at: usize, pat: &str) -> bool {
        let mut rest = chars[at..].iter();
        pat.chars()
            .all(|p| matches!(rest.next(), Some(c) if c.eq_ignore_ascii_case(&p)))
    }

    /// Finds the first index at or after `from` where `pat` occurs (ASCII-case-insensitive).
    fn find_ci(chars: &[char], from: usize, pat: &str) -> Option<usize> {
        (from..chars.len()).find(|&at| starts_with_ci(chars, at, pat))
    }

    /// If a `<script>`/`<style>` opening tag starts at `at`, returns its closing tag prefix.
    fn raw_text_close(chars: &[char], at: usize) -> Option<&'static str> {
        RAW_TEXT_TAGS.iter().find_map(|&(open, close)| {
            // The name must end here, so that e.g. `<style-guide>` is not matched.
            let delimited = match chars.get(at + open.len()) {
                Some(&next) => next.is_whitespace() || next == '>' || next == '/',
                None => true,
            };
            if delimited && starts_with_ci(chars, at, open) {
                Some(close)
            } else {
                None
            }
        })
    }

    /// Decodes a reference body such as `amp` or `#x27`; `None` means "unknown".
    fn decode_entity(body: &str) -> Option<char> {
        match body {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "nbsp" => Some(' '),
            _ => {
                let digits = body.strip_prefix('#')?;
                let hex = digits
                    .strip_prefix('x')
                    .or_else(|| digits.strip_prefix('X'));
                let code = match hex {
                    Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                    None => digits.parse::<u32>().ok()?,
                };
                // Control characters are not useful in plain-text output.
                char::from_u32(code).filter(|ch| !ch.is_control())
            }
        }
    }

    /// Parses a well-formed reference starting at the `&` at index `amp`.
    /// Returns the decoded character (if known) and the index just past the `;`.
    fn read_entity(chars: &[char], amp: usize) -> Option<(Option<char>, usize)> {
        let mut body = String::new();
        for (off, &ch) in chars[amp + 1..]
            .iter()
            .enumerate()
            .take(MAX_ENTITY_BODY + 1)
        {
            if ch == ';' {
                if body.is_empty() {
                    return None;
                }
                return Some((decode_entity(&body), amp + off + 2));
            }
            if !(ch.is_ascii_alphanumeric() || ch == '#') {
                return None;
            }
            body.push(ch);
        }
        None
    }

    /// Appends one character of text, collapsing whitespace runs to one space.
    fn emit(out: &mut String, at_break: &mut bool, ch: char) {
        if ch.is_whitespace() {
            if !*at_break && !out.is_empty() {
                out.push(' ');
            }
            *at_break = true;
        } else {
            out.push(ch);
            *at_break = false;
        }
    }

    let chars: Vec<char> = html.chars().collect();
    let len = chars.len();
    let mut out = String::with_capacity(html.len() / 3);
    // True while the output ends in a space or newline (or is still empty).
    let mut at_break = true;
    let mut i = 0;

    while i < len {
        let c = chars[i];

        // A `<` only opens markup when followed by a tag-name letter, `/`, `!`
        // or `?`; otherwise (as in "1 < 2") it is ordinary text.
        let opens_markup = c == '<'
            && matches!(
                chars.get(i + 1),
                Some(&n) if n.is_ascii_alphabetic() || n == '/' || n == '!' || n == '?'
            );

        if opens_markup {
            if starts_with_ci(&chars, i, "<!--") {
                // Skip the entire comment; it may legitimately contain `>`.
                i = find_ci(&chars, i + 2, "-->").map_or(len, |end| end + 3);
                continue;
            }

            if let Some(close) = raw_text_close(&chars, i) {
                // Jump straight to the closing tag so that `<` or `>` inside
                // the script/style body cannot be mistaken for markup. The
                // closing tag itself is consumed as a normal tag next round.
                i = find_ci(&chars, i + 1, close).unwrap_or(len);
                continue;
            }

            if LINE_BREAK_TAGS
                .iter()
                .any(|tag| starts_with_ci(&chars, i, tag))
            {
                if out.ends_with(' ') {
                    out.pop();
                }
                if !out.ends_with('\n') {
                    out.push('\n');
                }
                at_break = true;
            }

            // Drop everything up to and including the closing `>`.
            i = chars[i..]
                .iter()
                .position(|&ch| ch == '>')
                .map_or(len, |off| i + off + 1);
            continue;
        }

        if c == '&' {
            if let Some((decoded, next)) = read_entity(&chars, i) {
                emit(&mut out, &mut at_break, decoded.unwrap_or(' '));
                i = next;
                continue;
            }
        }

        emit(&mut out, &mut at_break, c);
        i += 1;
    }

    // Newlines are only pushed when the output does not already end in one and
    // spaces are never emitted after a break, so no blank-line squeezing pass
    // is needed.
    out.trim().to_string()
}