Skip to main content

matrixcode_core/tools/
websearch.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use serde_json::{Value, json};
6
7use super::{Tool, ToolDefinition};
8
9/// Client-side web search tool using DuckDuckGo HTML search.
10/// This tool performs web searches without requiring any API key.
11pub struct WebSearchTool;
12
13#[async_trait]
14impl Tool for WebSearchTool {
15    fn definition(&self) -> ToolDefinition {
16        ToolDefinition {
17            name: "websearch".to_string(),
18            description: "使用 DuckDuckGo 搜索网络信息。返回包含标题、URL 和摘要的搜索结果列表。用于查找互联网上的最新信息。".to_string(),
19            parameters: json!({
20                "type": "object",
21                "properties": {
22                    "query": {
23                        "type": "string",
24                        "description": "搜索查询"
25                    },
26                    "max_results": {
27                        "type": "integer",
28                        "description": "最大返回结果数(默认 5,最大 10)"
29                    }
30                },
31                "required": ["query"]
32            }),
33        }
34    }
35
36    async fn execute(&self, params: Value) -> Result<String> {
37        let query = params["query"].as_str().ok_or_else(|| anyhow::anyhow!("missing 'query' parameter"))?;
38        let max_results = params["max_results"].as_u64().unwrap_or(5).min(10) as usize;
39
40        // Show spinner while searching - RAII guard ensures cleanup on error
41        // let mut spinner = ToolSpinner::new(&format!("web-searching '{}'", query));
42
43        let results = search_duckduckgo(query, max_results).await?;
44
45        if results.is_empty() {
46            // spinner.finish_success("0 results");
47            return Ok("No results found.".to_string());
48        }
49
50        let output = results
51            .iter()
52            .enumerate()
53            .map(|(i, r)| {
54                let mut s = format!("{}. {}\n   {}", i + 1, r.title, r.url);
55                if let Some(ref snippet) = r.snippet {
56                    s.push_str(&format!("\n   {}", snippet));
57                }
58                s
59            })
60            .collect::<Vec<_>>()
61            .join("\n\n");
62
63        // spinner.finish_success(&format!("{} results", results.len()));
64        Ok(output)
65    }
66}
67
/// A single search result.
///
/// Instances are built by `parse_ddg_html` and rendered as text by
/// `WebSearchTool::execute`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchResult {
    /// Text of the result link, after `strip_html_tags` has been applied.
    title: String,
    /// The link's `href`, after `clean_url` has been applied.
    url: String,
    /// Snippet text for the result, or `None` when the parser matched no
    /// snippet for it.
    snippet: Option<String>,
}
75
76/// Perform a web search using DuckDuckGo HTML interface.
77async fn search_duckduckgo(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
78    let client = reqwest::Client::builder()
79        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
80        .build()?;
81
82    let url = format!("https://html.duckduckgo.com/html/?q={}", urlencoding_encode(query));
83
84    let response = client
85        .get(&url)
86        .send()
87        .await?;
88
89    if !response.status().is_success() {
90        anyhow::bail!("Search request failed with status: {}", response.status());
91    }
92
93    let html = response.text().await?;
94    let results = parse_ddg_html(&html, max_results);
95
96    Ok(results)
97}
98
99/// Parse DuckDuckGo HTML search results.
100fn parse_ddg_html(html: &str, max_results: usize) -> Vec<SearchResult> {
101    let mut results = Vec::new();
102
103    // DuckDuckGo HTML results are in <div class="result"> elements
104    // Each result contains:
105    // - <a class="result__a"> for the title and URL
106    // - <a class="result__snippet"> for the snippet
107
108    let _result_div_regex = Regex::new(r#"<div[^>]*class="[^"]*result[^"]*"[^>]*>(.*?)</div>\s*</div>"#).ok();
109    let link_regex = Regex::new(r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).ok();
110    let snippet_regex = Regex::new(r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#).ok();
111
112    // Alternative simpler parsing: look for result__a links
113    if let Some(ref link_re) = link_regex {
114        for cap in link_re.captures_iter(html) {
115            if results.len() >= max_results {
116                break;
117            }
118
119            let url = cap.get(1).map(|m| clean_url(m.as_str())).unwrap_or_default();
120            let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
121
122            // Skip ad results and empty URLs
123            if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
124                continue;
125            }
126
127            // Try to find snippet near this result
128            let snippet = snippet_regex.as_ref().and_then(|snip_re| {
129                snip_re.captures_iter(html)
130                    .find(|c| {
131                        if let Some(m) = c.get(0) {
132                            // Check if snippet is after current link position in HTML
133                            let link_pos = cap.get(0).unwrap().start();
134                            let snip_pos = m.start();
135                            snip_pos > link_pos && snip_pos < link_pos + 1000
136                        } else {
137                            false
138                        }
139                    })
140                    .and_then(|c| c.get(1).map(|m| strip_html_tags(m.as_str())))
141            });
142
143            results.push(SearchResult {
144                title,
145                url,
146                snippet,
147            });
148        }
149    }
150
151    // If simple parsing didn't work well, try alternative approach
152    if results.is_empty() {
153        // Fallback: parse using a more lenient pattern
154        let alt_link_re = Regex::new(r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#).ok();
155        if let Some(re) = alt_link_re {
156            for cap in re.captures_iter(html) {
157                if results.len() >= max_results {
158                    break;
159                }
160
161                let url = clean_url(cap.get(1).map(|m| m.as_str()).unwrap_or_default());
162                let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
163
164                if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
165                    continue;
166                }
167
168                results.push(SearchResult {
169                    title,
170                    url,
171                    snippet: None,
172                });
173            }
174        }
175    }
176
177    results
178}
179
180/// Clean DuckDuckGo redirect URLs to get the actual URL.
181fn clean_url(url: &str) -> String {
182    // DuckDuckGo uses redirect URLs like:
183    // https://duckduckgo.com/l/?uddg=ENCODED_URL&rut=...
184    if url.contains("duckduckgo.com/l/")
185        && let Some(query) = url.split("uddg=").nth(1)
186            && let Some(encoded) = query.split('&').next()
187                && let Ok(decoded) = urlencoding_decode(encoded) {
188                    return decoded;
189                }
190    url.to_string()
191}
192
/// Percent-encode `s` for use as a URL query value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are kept as-is, a space
/// becomes `+`, and every other byte of the UTF-8 encoding is written as
/// `%XX` with uppercase hex digits.
fn urlencoding_encode(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::with_capacity(s.len());
    // Working on bytes is equivalent to working on chars here: every byte of
    // a multi-byte character is >= 0x80 and therefore always escaped.
    for byte in s.bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                encoded.push(char::from(byte));
            }
            b' ' => encoded.push('+'),
            _ => {
                encoded.push('%');
                encoded.push(char::from(HEX[usize::from(byte >> 4)]));
                encoded.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        }
    }
    encoded
}
208
209/// Decode URL encoding.
210fn urlencoding_decode(s: &str) -> Result<String> {
211    let decoded = urlencoding_decode_simple(s);
212    Ok(decoded)
213}
214
/// Simple URL decoding without an external crate.
///
/// * `%XX`, where both `X` are hex digits, becomes the byte `0xXX`;
///   consecutive escapes are combined, so multi-byte UTF-8 sequences decode
///   correctly (e.g. `%E4%B8%AD` → `中`).
/// * `+` becomes a space.
/// * A `%` that is not followed by exactly two hex digits is kept literally,
///   and the characters after it are decoded as usual.
///
/// If the decoded bytes are not valid UTF-8, the invalid sequences are
/// replaced with U+FFFD.
fn urlencoding_decode_simple(s: &str) -> String {
    let input = s.as_bytes();
    // Only ASCII hex digits count; signs and whitespace are rejected.
    let hex_digit = |idx: usize| input.get(idx).and_then(|&b| char::from(b).to_digit(16));

    let mut bytes: Vec<u8> = Vec::with_capacity(input.len());
    let mut i = 0;
    while i < input.len() {
        match input[i] {
            b'%' => {
                if let (Some(hi), Some(lo)) = (hex_digit(i + 1), hex_digit(i + 2)) {
                    // Both digits are < 16, so the combined value fits in a byte.
                    bytes.push(((hi << 4) | lo) as u8);
                    i += 3;
                    continue;
                }
                // Malformed escape: keep the '%' and re-examine what follows.
                bytes.push(b'%');
            }
            b'+' => bytes.push(b' '),
            other => bytes.push(other),
        }
        i += 1;
    }

    String::from_utf8_lossy(&bytes).into_owned()
}
242
/// Strip HTML tags from a string and decode common HTML entities.
///
/// Everything from a `<` up to and including the next `>` is removed; a `<`
/// with no following `>` is left in place. Afterwards the entities `&amp;`,
/// `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&#x27;`, `&apos;` and `&nbsp;` (as a
/// regular space) are decoded, and surrounding whitespace is trimmed.
///
/// Entities are decoded in a single left-to-right pass so that already
/// decoded text is never decoded again: `&amp;lt;` yields `&lt;`, not `<`.
fn strip_html_tags(s: &str) -> String {
    const ENTITIES: [(&str, char); 8] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
        ("&nbsp;", ' '),
    ];

    // Pass 1: drop tags.
    let mut text = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('<') {
        let Some(close) = rest[open..].find('>') else {
            // Unterminated '<': not a tag, keep the remainder verbatim.
            break;
        };
        text.push_str(&rest[..open]);
        rest = &rest[open + close + 1..];
    }
    text.push_str(rest);

    // Pass 2: decode entities.
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text.as_str();
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| tail.starts_with(name)) {
            Some((name, ch)) => {
                decoded.push(*ch);
                rest = &tail[name.len()..];
            }
            None => {
                // A bare '&' that starts no known entity is kept as-is.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);

    decoded.trim().to_string()
}
260
261#[cfg(test)]
262mod tests {
263    use super::*;
264
265    #[test]
266    fn test_strip_html_tags() {
267        assert_eq!(strip_html_tags("<b>hello</b>"), "hello");
268        assert_eq!(strip_html_tags("a &amp; b"), "a & b");
269        assert_eq!(strip_html_tags("  <span>test</span>  "), "test");
270    }
271
272    #[test]
273    fn test_urlencoding_decode() {
274        assert_eq!(urlencoding_decode_simple("hello%20world"), "hello world");
275        assert_eq!(urlencoding_decode_simple("a+b"), "a b");
276        assert_eq!(urlencoding_decode_simple("%3Ctest%3E"), "<test>");
277    }
278
279    #[test]
280    fn test_clean_url() {
281        let redirect_url = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc";
282        assert_eq!(clean_url(redirect_url), "https://example.com");
283
284        let normal_url = "https://example.com/page";
285        assert_eq!(clean_url(normal_url), "https://example.com/page");
286    }
287}