use super::http::{HttpClient, HttpClientBuilder, HttpRequest, sanitize_url};
use super::{Tool, ToolContext, ToolError, ToolResult};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::time::Duration;
use url::Url;

/// Maximum response body size accepted by the fetch tool (5 MiB).
const MAX_RESPONSE_SIZE: usize = 5 * 1024 * 1024;

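/// Tool that fetches content from a URL over HTTP(S) and returns it as plain
/// text, markdown, or raw HTML.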
pub struct WebFetchTool {
    client: Box<dyn HttpClient>,
}

impl WebFetchTool {
    pub fn new() -> Result<Self, ToolError> {
        let client = HttpClientBuilder::new()
            .rate_limit(2.0)
            .timeout(Duration::from_secs(30))
            .verify_ssl(true)
            .build()
            .map_err(|e| ToolError::Other(e.into()))?;

        Ok(Self { client })
    }
}

#[derive(Debug, Deserialize)]
struct WebFetchParams {
    url: String,
    format: Option<String>,
    timeout: Option<u64>,
}

#[async_trait]
impl Tool for WebFetchTool {
    fn id(&self) -> &str {
        "webfetch"
    }

    fn description(&self) -> &str {
        "Fetches content from a URL and processes it according to the requested format. Supports HTML text extraction and markdown conversion."
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch content from (HTTP/HTTPS only)"
                },
                "format": {
                    "type": "string",
                    "enum": ["text", "markdown", "html"],
                    "description": "The format to return the content in",
                    "default": "text"
                },
                "timeout": {
                    "type": "number",
                    "minimum": 1,
                    "maximum": 120,
                    "description": "Optional timeout in seconds (max 120)"
                }
            },
            "required": ["url"]
        })
    }

    async fn execute(&self, args: Value, _ctx: ToolContext) -> Result<ToolResult, ToolError> {
        let params: WebFetchParams = serde_json::from_value(args)
            .map_err(|e| ToolError::InvalidParameters(e.to_string()))?;

        // Validate and sanitize the URL (HTTP/HTTPS only, per the schema) before issuing the request.
        let url = sanitize_url(&params.url)
            .map_err(|e| ToolError::PermissionDenied(e.to_string()))?;

        let timeout = Duration::from_secs(params.timeout.unwrap_or(30).min(120));
        let request = HttpRequest::get(url)
            .timeout(timeout)
            .header("Accept".to_string(), "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8".to_string())
            .header("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());

        let response = self.client.execute(request).await
            .map_err(|e| ToolError::ExecutionFailed(format!("Request failed: {}", e)))?;

        if !response.is_success() {
            return Err(ToolError::ExecutionFailed(format!("Request failed with status: {}", response.status())));
        }

        if response.body().len() > MAX_RESPONSE_SIZE {
            return Err(ToolError::ExecutionFailed("Response too large (exceeds 5MB limit)".to_string()));
        }

        let content_type = response.content_type()
            .cloned()
            .unwrap_or_else(|| "text/plain".to_string());

        let text = response.text()
            .map_err(|e| ToolError::ExecutionFailed(format!("Failed to decode response: {}", e)))?;

        let format = params.format.as_deref().unwrap_or("text");

        // HTML responses are post-processed; everything else is passed through
        // (or fenced, for the markdown format).
        let output = match format {
            "text" => {
                if content_type.contains("text/html") {
                    extract_text_from_html(&text)?
                } else {
                    text
                }
            },
            "markdown" => {
                if content_type.contains("text/html") {
                    convert_html_to_markdown(&text)?
                } else {
                    format!("```\n{}\n```", text)
                }
            },
            "html" => text,
            _ => return Err(ToolError::InvalidParameters("Invalid format specified".to_string())),
        };

        Ok(ToolResult {
            title: format!("{} ({})", params.url, content_type),
            output,
            metadata: json!({
                "url": params.url,
                "content_type": content_type,
                "size": response.body().len(),
                "format": format,
                "status": response.status()
            }),
        })
    }
}

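/// Tool that searches the web (currently via the DuckDuckGo Instant Answer API)
/// and returns formatted results.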
pub struct WebSearchTool {
    client: Box<dyn HttpClient>,
}

impl WebSearchTool {
    pub fn new() -> Result<Self, ToolError> {
        let client = HttpClientBuilder::new()
            .rate_limit(1.0)
            .timeout(Duration::from_secs(30))
            .verify_ssl(true)
            .build()
            .map_err(|e| ToolError::Other(e.into()))?;

        Ok(Self { client })
    }

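    /// Queries the DuckDuckGo Instant Answer API and maps the response into at
    /// most `max_results` `SearchResult`s.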
    async fn search_duckduckgo(&self, query: &str, max_results: u32) -> Result<Vec<SearchResult>, ToolError> {
        let search_url = format!(
            "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1",
            urlencoding::encode(query)
        );

        let url = Url::parse(&search_url)
            .map_err(|e| ToolError::ExecutionFailed(format!("Invalid search URL: {}", e)))?;

        let request = HttpRequest::get(url)
            .header("Accept".to_string(), "application/json".to_string());

        let response = self.client.execute(request).await
            .map_err(|e| ToolError::ExecutionFailed(format!("Search request failed: {}", e)))?;

        if !response.is_success() {
            return Err(ToolError::ExecutionFailed(format!("Search failed with status: {}", response.status())));
        }

        let search_response: DuckDuckGoResponse = response.json()
            .map_err(|e| ToolError::ExecutionFailed(format!("Failed to parse search response: {}", e)))?;

        let mut results = Vec::new();

        // Instant answer, if present, always ranks first.
        if !search_response.answer.is_empty() {
            results.push(SearchResult {
                title: "Instant Answer".to_string(),
                url: search_response.answer_url.unwrap_or_else(|| "https://duckduckgo.com".to_string()),
                description: search_response.answer,
                rank: 1,
                source: "DuckDuckGo".to_string(),
            });
        }

        // Topic abstract/summary, if present.
        if !search_response.abstract_text.is_empty() {
            results.push(SearchResult {
                title: search_response.heading.unwrap_or_else(|| "Summary".to_string()),
                url: search_response.abstract_url.unwrap_or_else(|| "https://duckduckgo.com".to_string()),
                description: search_response.abstract_text,
                rank: results.len() as u32 + 1,
                source: "DuckDuckGo".to_string(),
            });
        }

        // Related topics fill the remaining slots.
        for topic in search_response.related_topics.iter().take(max_results as usize) {
            if !topic.text.is_empty() {
                results.push(SearchResult {
                    title: format!("Related: {}", topic.first_url.split('/').last().unwrap_or("Topic")),
                    url: topic.first_url.clone(),
                    description: topic.text.clone(),
                    rank: results.len() as u32 + 1,
                    source: "DuckDuckGo".to_string(),
                });
            }
        }

        Ok(results.into_iter().take(max_results as usize).collect())
    }
}

#[derive(Debug, Deserialize)]
struct WebSearchParams {
    query: String,
    max_results: Option<u32>,
    language: Option<String>,
    provider: Option<String>,
}

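/// A single search result in the normalized form returned by the tool.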
#[derive(Debug, Serialize, Deserialize, Clone)]
struct SearchResult {
    title: String,
    url: String,
    description: String,
    rank: u32,
    source: String,
}

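/// The subset of the DuckDuckGo Instant Answer API response that this tool consumes.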
#[derive(Debug, Deserialize)]
struct DuckDuckGoResponse {
    #[serde(rename = "Answer", default)]
    answer: String,
    #[serde(rename = "AnswerURL")]
    answer_url: Option<String>,
    #[serde(rename = "Abstract", default)]
    abstract_text: String,
    #[serde(rename = "AbstractURL")]
    abstract_url: Option<String>,
    #[serde(rename = "Heading")]
    heading: Option<String>,
    #[serde(rename = "RelatedTopics", default)]
    related_topics: Vec<RelatedTopic>,
}

// RelatedTopics entries can be nested "topic groups" without their own Text or
// FirstURL; defaulting those fields to empty strings lets such entries
// deserialize (and be skipped downstream) instead of failing the whole response.
#[derive(Debug, Deserialize)]
struct RelatedTopic {
    #[serde(rename = "Text", default)]
    text: String,
    #[serde(rename = "FirstURL", default)]
    first_url: String,
}

#[async_trait]
impl Tool for WebSearchTool {
    fn id(&self) -> &str {
        "websearch"
    }

    fn description(&self) -> &str {
        "Searches the web using various search providers and returns formatted search results"
    }

    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query"
                },
                "max_results": {
                    "type": "number",
                    "minimum": 1,
                    "maximum": 20,
                    "default": 10,
                    "description": "Maximum number of results to return"
                },
                "language": {
                    "type": "string",
                    "default": "en",
                    "description": "Language for search results"
                },
                "provider": {
                    "type": "string",
                    "enum": ["duckduckgo", "auto"],
                    "default": "duckduckgo",
                    "description": "Search provider to use"
                }
            },
            "required": ["query"]
        })
    }

    async fn execute(&self, args: Value, _ctx: ToolContext) -> Result<ToolResult, ToolError> {
        let params: WebSearchParams = serde_json::from_value(args)
            .map_err(|e| ToolError::InvalidParameters(e.to_string()))?;

        let max_results = params.max_results.unwrap_or(10).min(20);
        let provider = params.provider.as_deref().unwrap_or("duckduckgo");

        let results = match provider {
            "duckduckgo" | "auto" => {
                self.search_duckduckgo(&params.query, max_results).await?
            },
            _ => {
                return Err(ToolError::InvalidParameters(format!("Unsupported search provider: {}", provider)));
            }
        };

        // Render the results as a numbered, markdown-formatted list.
        let output = if results.is_empty() {
            format!("No search results found for query: {}", params.query)
        } else {
            let mut output = format!("Search results for: {}\n\n", params.query);
            for result in &results {
                output.push_str(&format!(
                    "{}. **{}**\n URL: {}\n {}\n Source: {}\n\n",
                    result.rank,
                    result.title,
                    result.url,
                    result.description,
                    result.source
                ));
            }
            output
        };

        Ok(ToolResult {
            title: format!("Search results for: {}", params.query),
            output,
            metadata: json!({
                "query": params.query,
                "results_count": results.len(),
                "max_results": max_results,
                "language": params.language.unwrap_or_else(|| "en".to_string()),
                "provider": provider,
                "results": results
            }),
        })
    }
}

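/// Strips script/style/noscript elements from an HTML document and returns the
/// remaining visible text with whitespace collapsed.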
fn extract_text_from_html(html: &str) -> Result<String, ToolError> {
    use scraper::{Html, Selector};

    let document = Html::parse_document(html);

    // Remove script, style, and noscript elements by deleting their serialized
    // form from the raw HTML, then re-parse the cleaned document.
    let script_selector = Selector::parse("script, style, noscript").unwrap();
    let mut clean_html = html.to_string();

    for element in document.select(&script_selector) {
        clean_html = clean_html.replace(&element.html(), "");
    }

    let clean_document = Html::parse_document(&clean_html);
    let body_selector = Selector::parse("body").unwrap();

    // Prefer the body text; fall back to the whole document if there is no body.
    let text = if let Some(body) = clean_document.select(&body_selector).next() {
        body.text().collect::<Vec<_>>().join(" ")
    } else {
        clean_document.root_element().text().collect::<Vec<_>>().join(" ")
    };

    // Collapse runs of whitespace into single spaces.
    let re = regex::Regex::new(r"\s+").unwrap();
    let cleaned = re.replace_all(&text, " ");

    Ok(cleaned.trim().to_string())
}

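/// Converts an HTML document to markdown after stripping scripts, comments, and
/// noisy attributes.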
fn convert_html_to_markdown(html: &str) -> Result<String, ToolError> {
    let clean_html = clean_html_for_markdown(html);

    let markdown = html2md::parse_html(&clean_html);

    // Collapse excess blank lines left over from the conversion.
    let re = regex::Regex::new(r"\n\s*\n\s*\n").unwrap();
    let cleaned = re.replace_all(&markdown, "\n\n");

    Ok(cleaned.trim().to_string())
}

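/// Removes scripts, styles, HTML comments, and noisy attributes (class, id,
/// style, inline event handlers) so the markdown converter only sees content.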
fn clean_html_for_markdown(html: &str) -> String {
    let mut cleaned = html.to_string();

    // Drop script, style, and noscript blocks entirely. (The `regex` crate has
    // no backreferences, so each closing tag is spelled out explicitly.)
    let re = regex::Regex::new(
        r"(?si)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<noscript[^>]*>.*?</noscript>",
    )
    .unwrap();
    cleaned = re.replace_all(&cleaned, "").to_string();

    // Drop HTML comments.
    let re = regex::Regex::new(r"(?s)<!--.*?-->").unwrap();
    cleaned = re.replace_all(&cleaned, "").to_string();

    // Drop attributes that add nothing to the markdown output.
    let re = regex::Regex::new(r#"\s+(class|id|style|onclick|onload)="[^"]*""#).unwrap();
    cleaned = re.replace_all(&cleaned, "").to_string();

    cleaned
}
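
// A minimal sanity-check sketch for the pure HTML helpers above; the
// network-backed tools are intentionally not exercised here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extract_text_drops_scripts_and_collapses_whitespace() {
        let html = "<html><body><p>Hello   world</p><script>var x = 1;</script></body></html>";
        let text = extract_text_from_html(html).unwrap();
        assert!(text.contains("Hello world"));
        assert!(!text.contains("var x"));
    }

    #[test]
    fn clean_html_strips_scripts_comments_and_attributes() {
        let html = r#"<div class="wrap"><!-- hidden --><script>bad()</script><p>Keep me</p></div>"#;
        let cleaned = clean_html_for_markdown(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("hidden"));
        assert!(!cleaned.contains("class="));
        assert!(cleaned.contains("Keep me"));
    }
}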