code_mesh_core/tool/
web.rs

1//! Web tools implementation
2
3use super::http::{HttpClient, HttpClientBuilder, HttpRequest, sanitize_url};
4use super::{Tool, ToolContext, ToolError, ToolResult};
5use async_trait::async_trait;
6use serde::{Deserialize, Serialize};
7use serde_json::{json, Value};
8use std::time::Duration;
9use url::Url;
10
/// Upper bound, in bytes, on a fetched response body: 5 MiB.
const MAX_RESPONSE_SIZE: usize = 5 * (1 << 20);
13
14/// Web fetch tool for retrieving content from URLs
15pub struct WebFetchTool {
16    client: Box<dyn HttpClient>,
17}
18
19impl WebFetchTool {
20    pub fn new() -> Result<Self, ToolError> {
21        let client = HttpClientBuilder::new()
22            .rate_limit(2.0) // 2 requests per second
23            .timeout(Duration::from_secs(30))
24            .verify_ssl(true)
25            .build()
26            .map_err(|e| ToolError::Other(e.into()))?;
27        
28        Ok(Self { client })
29    }
30}
31
32#[derive(Debug, Deserialize)]
33struct WebFetchParams {
34    url: String,
35    format: Option<String>, // "text", "markdown", "html"
36    timeout: Option<u64>,
37}
38
39#[async_trait]
40impl Tool for WebFetchTool {
41    fn id(&self) -> &str {
42        "webfetch"
43    }
44    
45    fn description(&self) -> &str {
46        "Fetches content from a specified URL and processes it according to the specified format. Supports HTML text extraction and markdown conversion."
47    }
48    
49    fn parameters_schema(&self) -> Value {
50        json!({
51            "type": "object",
52            "properties": {
53                "url": {
54                    "type": "string",
55                    "description": "The URL to fetch content from (HTTP/HTTPS only)"
56                },
57                "format": {
58                    "type": "string",
59                    "enum": ["text", "markdown", "html"],
60                    "description": "The format to return the content in",
61                    "default": "text"
62                },
63                "timeout": {
64                    "type": "number",
65                    "minimum": 1,
66                    "maximum": 120,
67                    "description": "Optional timeout in seconds (max 120)"
68                }
69            },
70            "required": ["url"]
71        })
72    }
73    
74    async fn execute(&self, args: Value, _ctx: ToolContext) -> Result<ToolResult, ToolError> {
75        let params: WebFetchParams = serde_json::from_value(args)
76            .map_err(|e| ToolError::InvalidParameters(e.to_string()))?;
77        
78        // Sanitize and validate URL for security
79        let url = sanitize_url(&params.url)
80            .map_err(|e| ToolError::PermissionDenied(e.to_string()))?;
81        
82        // Build request with timeout
83        let timeout = Duration::from_secs(params.timeout.unwrap_or(30).min(120));
84        let request = HttpRequest::get(url.clone())
85            .timeout(timeout)
86            .header("Accept".to_string(), "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8".to_string())
87            .header("Accept-Language".to_string(), "en-US,en;q=0.9".to_string());
88        
89        let response = self.client.execute(request).await
90            .map_err(|e| ToolError::ExecutionFailed(format!("Request failed: {}", e)))?;
91        
92        if !response.is_success() {
93            return Err(ToolError::ExecutionFailed(format!("Request failed with status: {}", response.status())));
94        }
95        
96        // Check content length
97        if response.body().len() > MAX_RESPONSE_SIZE {
98            return Err(ToolError::ExecutionFailed("Response too large (exceeds 5MB limit)".to_string()));
99        }
100        
101        let content_type = response.content_type()
102            .cloned()
103            .unwrap_or_else(|| "text/plain".to_string());
104        
105        let text = response.text()
106            .map_err(|e| ToolError::ExecutionFailed(format!("Failed to decode response: {}", e)))?;
107        
108        let format = params.format.as_deref().unwrap_or("text");
109        
110        let output = match format {
111            "text" => {
112                if content_type.contains("text/html") {
113                    extract_text_from_html(&text)?
114                } else {
115                    text
116                }
117            },
118            "markdown" => {
119                if content_type.contains("text/html") {
120                    convert_html_to_markdown(&text)?
121                } else {
122                    format!("```\n{}\n```", text)
123                }
124            },
125            "html" => text,
126            _ => return Err(ToolError::InvalidParameters("Invalid format specified".to_string())),
127        };
128        
129        Ok(ToolResult {
130            title: format!("{} ({})", params.url, content_type),
131            output,
132            metadata: json!({
133                "url": params.url,
134                "content_type": content_type,
135                "size": response.body().len(),
136                "format": format,
137                "status": response.status()
138            }),
139        })
140    }
141}
142
143/// Web search tool for searching the internet
144pub struct WebSearchTool {
145    client: Box<dyn HttpClient>,
146}
147
148impl WebSearchTool {
149    pub fn new() -> Result<Self, ToolError> {
150        let client = HttpClientBuilder::new()
151            .rate_limit(1.0) // 1 request per second for search
152            .timeout(Duration::from_secs(30))
153            .verify_ssl(true)
154            .build()
155            .map_err(|e| ToolError::Other(e.into()))?;
156        
157        Ok(Self { client })
158    }
159    
160    /// Search using DuckDuckGo Instant Answer API
161    async fn search_duckduckgo(&self, query: &str, max_results: u32) -> Result<Vec<SearchResult>, ToolError> {
162        let search_url = format!(
163            "https://api.duckduckgo.com/?q={}&format=json&no_html=1&skip_disambig=1",
164            urlencoding::encode(query)
165        );
166        
167        let url = Url::parse(&search_url)
168            .map_err(|e| ToolError::ExecutionFailed(format!("Invalid search URL: {}", e)))?;
169        
170        let request = HttpRequest::get(url)
171            .header("Accept".to_string(), "application/json".to_string());
172        
173        let response = self.client.execute(request).await
174            .map_err(|e| ToolError::ExecutionFailed(format!("Search request failed: {}", e)))?;
175        
176        if !response.is_success() {
177            return Err(ToolError::ExecutionFailed(format!("Search failed with status: {}", response.status())));
178        }
179        
180        let search_response: DuckDuckGoResponse = response.json()
181            .map_err(|e| ToolError::ExecutionFailed(format!("Failed to parse search response: {}", e)))?;
182        
183        let mut results = Vec::new();
184        
185        // Add instant answer if available
186        if !search_response.answer.is_empty() {
187            results.push(SearchResult {
188                title: "Instant Answer".to_string(),
189                url: search_response.answer_url.unwrap_or_else(|| "https://duckduckgo.com".to_string()),
190                description: search_response.answer,
191                rank: 1,
192                source: "DuckDuckGo".to_string(),
193            });
194        }
195        
196        // Add abstract if available
197        if !search_response.abstract_text.is_empty() {
198            results.push(SearchResult {
199                title: search_response.heading.unwrap_or_else(|| "Summary".to_string()),
200                url: search_response.abstract_url.unwrap_or_else(|| "https://duckduckgo.com".to_string()),
201                description: search_response.abstract_text,
202                rank: results.len() as u32 + 1,
203                source: "DuckDuckGo".to_string(),
204            });
205        }
206        
207        // Add related topics
208        for (i, topic) in search_response.related_topics.iter().take(max_results as usize).enumerate() {
209            if !topic.text.is_empty() {
210                results.push(SearchResult {
211                    title: format!("Related: {}", topic.first_url.split('/').last().unwrap_or("Topic")),
212                    url: topic.first_url.clone(),
213                    description: topic.text.clone(),
214                    rank: results.len() as u32 + 1,
215                    source: "DuckDuckGo".to_string(),
216                });
217            }
218        }
219        
220        Ok(results.into_iter().take(max_results as usize).collect())
221    }
222}
223
224#[derive(Debug, Deserialize)]
225struct WebSearchParams {
226    query: String,
227    max_results: Option<u32>,
228    language: Option<String>,
229    provider: Option<String>, // "duckduckgo", "bing", "google"
230}
231
232#[derive(Debug, Serialize, Deserialize, Clone)]
233struct SearchResult {
234    title: String,
235    url: String,
236    description: String,
237    rank: u32,
238    source: String,
239}
240
241#[derive(Debug, Deserialize)]
242struct DuckDuckGoResponse {
243    #[serde(rename = "Answer")]
244    answer: String,
245    #[serde(rename = "AnswerURL")]
246    answer_url: Option<String>,
247    #[serde(rename = "Abstract")]
248    abstract_text: String,
249    #[serde(rename = "AbstractURL")]
250    abstract_url: Option<String>,
251    #[serde(rename = "Heading")]
252    heading: Option<String>,
253    #[serde(rename = "RelatedTopics")]
254    related_topics: Vec<RelatedTopic>,
255}
256
257#[derive(Debug, Deserialize)]
258struct RelatedTopic {
259    #[serde(rename = "Text")]
260    text: String,
261    #[serde(rename = "FirstURL")]
262    first_url: String,
263}
264
265#[async_trait]
266impl Tool for WebSearchTool {
267    fn id(&self) -> &str {
268        "websearch"
269    }
270    
271    fn description(&self) -> &str {
272        "Searches the web using various search providers and returns formatted search results"
273    }
274    
275    fn parameters_schema(&self) -> Value {
276        json!({
277            "type": "object",
278            "properties": {
279                "query": {
280                    "type": "string",
281                    "description": "The search query"
282                },
283                "max_results": {
284                    "type": "number",
285                    "minimum": 1,
286                    "maximum": 20,
287                    "default": 10,
288                    "description": "Maximum number of results to return"
289                },
290                "language": {
291                    "type": "string",
292                    "default": "en",
293                    "description": "Language for search results"
294                },
295                "provider": {
296                    "type": "string",
297                    "enum": ["duckduckgo", "auto"],
298                    "default": "duckduckgo",
299                    "description": "Search provider to use"
300                }
301            },
302            "required": ["query"]
303        })
304    }
305    
306    async fn execute(&self, args: Value, _ctx: ToolContext) -> Result<ToolResult, ToolError> {
307        let params: WebSearchParams = serde_json::from_value(args)
308            .map_err(|e| ToolError::InvalidParameters(e.to_string()))?;
309        
310        let max_results = params.max_results.unwrap_or(10).min(20);
311        let provider = params.provider.as_deref().unwrap_or("duckduckgo");
312        
313        let results = match provider {
314            "duckduckgo" | "auto" => {
315                self.search_duckduckgo(&params.query, max_results).await?
316            },
317            _ => {
318                return Err(ToolError::InvalidParameters(format!("Unsupported search provider: {}", provider)));
319            }
320        };
321        
322        let output = if results.is_empty() {
323            format!("No search results found for query: {}", params.query)
324        } else {
325            let mut output = format!("Search results for: {}\n\n", params.query);
326            for result in &results {
327                output.push_str(&format!(
328                    "{}. **{}**\n   URL: {}\n   {}\n   Source: {}\n\n",
329                    result.rank,
330                    result.title,
331                    result.url,
332                    result.description,
333                    result.source
334                ));
335            }
336            output
337        };
338        
339        Ok(ToolResult {
340            title: format!("Search results for: {}", params.query),
341            output,
342            metadata: json!({
343                "query": params.query,
344                "results_count": results.len(),
345                "max_results": max_results,
346                "language": params.language.unwrap_or_else(|| "en".to_string()),
347                "provider": provider,
348                "results": results
349            }),
350        })
351    }
352}
353
354/// Extract text content from HTML using scraper
355fn extract_text_from_html(html: &str) -> Result<String, ToolError> {
356    use scraper::{Html, Selector};
357    
358    let document = Html::parse_document(html);
359    
360    // Remove script and style content
361    let script_selector = Selector::parse("script, style, noscript").unwrap();
362    let mut clean_html = html.to_string();
363    
364    for element in document.select(&script_selector) {
365        if let Some(html_content) = element.html().get(0..element.html().len()) {
366            clean_html = clean_html.replace(html_content, "");
367        }
368    }
369    
370    let clean_document = Html::parse_document(&clean_html);
371    let body_selector = Selector::parse("body").unwrap();
372    
373    let text = if let Some(body) = clean_document.select(&body_selector).next() {
374        body.text().collect::<Vec<_>>().join(" ")
375    } else {
376        // Fallback to whole document
377        clean_document.root_element().text().collect::<Vec<_>>().join(" ")
378    };
379    
380    // Clean up whitespace
381    let re = regex::Regex::new(r"\s+").unwrap();
382    let cleaned = re.replace_all(&text, " ");
383    
384    Ok(cleaned.trim().to_string())
385}
386
387/// Convert HTML to Markdown using html2md
388fn convert_html_to_markdown(html: &str) -> Result<String, ToolError> {
389    // Clean the HTML first
390    let clean_html = clean_html_for_markdown(html);
391    
392    // Convert to markdown
393    let markdown = html2md::parse_html(&clean_html);
394    
395    // Clean up the markdown
396    let re = regex::Regex::new(r"\n\s*\n\s*\n").unwrap();
397    let cleaned = re.replace_all(&markdown, "\n\n");
398    
399    Ok(cleaned.trim().to_string())
400}
401
402/// Clean HTML before markdown conversion
403fn clean_html_for_markdown(html: &str) -> String {
404    let mut cleaned = html.to_string();
405    
406    // Remove script and style tags
407    let re = regex::Regex::new(r"(?s)<(script|style|noscript)[^>]*>.*?</\1>").unwrap();
408    cleaned = re.replace_all(&cleaned, "").to_string();
409    
410    // Remove comments
411    let re = regex::Regex::new(r"(?s)<!--.*?-->").unwrap();
412    cleaned = re.replace_all(&cleaned, "").to_string();
413    
414    // Clean up attributes we don't need
415    let re = regex::Regex::new(r#"\s+(class|id|style|onclick|onload)="[^"]*""#).unwrap();
416    cleaned = re.replace_all(&cleaned, "").to_string();
417    
418    cleaned
419}