Skip to main content

matrixcode_core/tools/
websearch.rs

1use anyhow::Result;
2use async_trait::async_trait;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use serde_json::{Value, json};
6
7use super::{Tool, ToolDefinition};
8
/// Client-side web search tool using DuckDuckGo HTML search.
///
/// This tool performs web searches without requiring any API key.
/// It is a stateless unit struct; all behavior lives in its `Tool`
/// implementation.
pub struct WebSearchTool;
12
13#[async_trait]
14impl Tool for WebSearchTool {
15    fn definition(&self) -> ToolDefinition {
16        ToolDefinition {
17            name: "websearch".to_string(),
18            description: "使用 DuckDuckGo 搜索网络信息。返回包含标题、URL 和摘要的搜索结果列表。用于查找互联网上的最新信息。".to_string(),
19            parameters: json!({
20                "type": "object",
21                "properties": {
22                    "query": {
23                        "type": "string",
24                        "description": "搜索查询"
25                    },
26                    "max_results": {
27                        "type": "integer",
28                        "description": "最大返回结果数(默认 5,最大 10)"
29                    }
30                },
31                "required": ["query"]
32            }),
33        }
34    }
35
36    async fn execute(&self, params: Value) -> Result<String> {
37        let query = params["query"]
38            .as_str()
39            .ok_or_else(|| anyhow::anyhow!("missing 'query' parameter"))?;
40        let max_results = params["max_results"].as_u64().unwrap_or(5).min(10) as usize;
41
42        // Show spinner while searching - RAII guard ensures cleanup on error
43        // let mut spinner = ToolSpinner::new(&format!("web-searching '{}'", query));
44
45        let results = search_duckduckgo(query, max_results).await?;
46
47        if results.is_empty() {
48            // spinner.finish_success("0 results");
49            return Ok("No results found.".to_string());
50        }
51
52        let output = results
53            .iter()
54            .enumerate()
55            .map(|(i, r)| {
56                let mut s = format!("{}. {}\n   {}", i + 1, r.title, r.url);
57                if let Some(ref snippet) = r.snippet {
58                    s.push_str(&format!("\n   {}", snippet));
59                }
60                s
61            })
62            .collect::<Vec<_>>()
63            .join("\n\n");
64
65        // spinner.finish_success(&format!("{} results", results.len()));
66        Ok(output)
67    }
68}
69
/// A single search result.
///
/// Produced by `parse_ddg_html` and rendered into text by
/// `WebSearchTool::execute`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchResult {
    /// Link text of the result with HTML tags stripped.
    title: String,
    /// Target URL, after DuckDuckGo redirect links have been unwrapped.
    url: String,
    /// Descriptive text for the result; `None` when no snippet was found.
    snippet: Option<String>,
}
77
78/// Perform a web search using DuckDuckGo HTML interface.
79async fn search_duckduckgo(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
80    let client = reqwest::Client::builder()
81        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
82        .build()?;
83
84    let url = format!(
85        "https://html.duckduckgo.com/html/?q={}",
86        urlencoding_encode(query)
87    );
88
89    let response = client.get(&url).send().await?;
90
91    if !response.status().is_success() {
92        anyhow::bail!("Search request failed with status: {}", response.status());
93    }
94
95    let html = response.text().await?;
96    let results = parse_ddg_html(&html, max_results);
97
98    Ok(results)
99}
100
101/// Parse DuckDuckGo HTML search results.
102fn parse_ddg_html(html: &str, max_results: usize) -> Vec<SearchResult> {
103    let mut results = Vec::new();
104
105    // DuckDuckGo HTML results are in <div class="result"> elements
106    // Each result contains:
107    // - <a class="result__a"> for the title and URL
108    // - <a class="result__snippet"> for the snippet
109
110    let _result_div_regex =
111        Regex::new(r#"<div[^>]*class="[^"]*result[^"]*"[^>]*>(.*?)</div>\s*</div>"#).ok();
112    let link_regex =
113        Regex::new(r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).ok();
114    let snippet_regex =
115        Regex::new(r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#).ok();
116
117    // Alternative simpler parsing: look for result__a links
118    if let Some(ref link_re) = link_regex {
119        for cap in link_re.captures_iter(html) {
120            if results.len() >= max_results {
121                break;
122            }
123
124            let url = cap
125                .get(1)
126                .map(|m| clean_url(m.as_str()))
127                .unwrap_or_default();
128            let title = cap
129                .get(2)
130                .map(|m| strip_html_tags(m.as_str()))
131                .unwrap_or_default();
132
133            // Skip ad results and empty URLs
134            if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
135                continue;
136            }
137
138            // Try to find snippet near this result
139            let snippet = snippet_regex.as_ref().and_then(|snip_re| {
140                snip_re
141                    .captures_iter(html)
142                    .find(|c| {
143                        if let Some(m) = c.get(0) {
144                            // Check if snippet is after current link position in HTML
145                            let link_pos = cap.get(0).unwrap().start();
146                            let snip_pos = m.start();
147                            snip_pos > link_pos && snip_pos < link_pos + 1000
148                        } else {
149                            false
150                        }
151                    })
152                    .and_then(|c| c.get(1).map(|m| strip_html_tags(m.as_str())))
153            });
154
155            results.push(SearchResult {
156                title,
157                url,
158                snippet,
159            });
160        }
161    }
162
163    // If simple parsing didn't work well, try alternative approach
164    if results.is_empty() {
165        // Fallback: parse using a more lenient pattern
166        let alt_link_re =
167            Regex::new(r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#)
168                .ok();
169        if let Some(re) = alt_link_re {
170            for cap in re.captures_iter(html) {
171                if results.len() >= max_results {
172                    break;
173                }
174
175                let url = clean_url(cap.get(1).map(|m| m.as_str()).unwrap_or_default());
176                let title = cap
177                    .get(2)
178                    .map(|m| strip_html_tags(m.as_str()))
179                    .unwrap_or_default();
180
181                if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
182                    continue;
183                }
184
185                results.push(SearchResult {
186                    title,
187                    url,
188                    snippet: None,
189                });
190            }
191        }
192    }
193
194    results
195}
196
197/// Clean DuckDuckGo redirect URLs to get the actual URL.
198fn clean_url(url: &str) -> String {
199    // DuckDuckGo uses redirect URLs like:
200    // https://duckduckgo.com/l/?uddg=ENCODED_URL&rut=...
201    if url.contains("duckduckgo.com/l/")
202        && let Some(query) = url.split("uddg=").nth(1)
203        && let Some(encoded) = query.split('&').next()
204        && let Ok(decoded) = urlencoding_decode(encoded)
205    {
206        return decoded;
207    }
208    url.to_string()
209}
210
/// Percent-encode `s` for use as a query-string value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are copied as-is, a space
/// becomes `+`, and every other byte of the UTF-8 encoding is written as
/// `%XX` with uppercase hex digits.
fn urlencoding_encode(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::with_capacity(s.len());
    // Working on bytes is equivalent to working on chars here: the bytes of a
    // multi-byte character are all >= 0x80 and therefore always escaped.
    for &byte in s.as_bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                encoded.push(char::from(byte));
            }
            b' ' => encoded.push('+'),
            _ => {
                encoded.push('%');
                encoded.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
                encoded.push(char::from(HEX_DIGITS[usize::from(byte & 0x0F)]));
            }
        }
    }
    encoded
}
226
227/// Decode URL encoding.
228fn urlencoding_decode(s: &str) -> Result<String> {
229    let decoded = urlencoding_decode_simple(s);
230    Ok(decoded)
231}
232
/// Simple URL decoding without external crate.
///
/// - `%XY`, where `X` and `Y` are both hexadecimal digits (either case), is
///   decoded to the byte `0xXY`;
/// - `+` is decoded to a space;
/// - everything else is copied through unchanged.
///
/// A `%` that is not followed by exactly two hex digits is kept literally and
/// decoding resumes with the very next character, so malformed input such as
/// `"%5"` or `"%+5"` never produces stray control bytes and never swallows
/// the characters after the `%`.
///
/// Correctly handles multi-byte UTF-8 sequences (e.g. %E4%B8%AD → 中). If the
/// decoded bytes are not valid UTF-8, invalid sequences are replaced with
/// U+FFFD.
fn urlencoding_decode_simple(s: &str) -> String {
    // Value of an ASCII hex digit; `None` for any other byte. Bytes >= 0x80
    // map to Latin-1 chars, which are never hex digits.
    fn hex_value(byte: u8) -> Option<u32> {
        char::from(byte).to_digit(16)
    }

    let input = s.as_bytes();
    let mut bytes: Vec<u8> = Vec::with_capacity(input.len());
    let mut i = 0;

    while i < input.len() {
        match input[i] {
            b'%' => {
                let hi = input.get(i + 1).copied().and_then(hex_value);
                let lo = input.get(i + 2).copied().and_then(hex_value);
                if let (Some(hi), Some(lo)) = (hi, lo) {
                    // Two hex digits: the value is at most 0xFF, so the cast
                    // cannot truncate.
                    bytes.push(((hi << 4) | lo) as u8);
                    i += 3;
                    continue;
                }
                // Malformed escape: keep the '%' and re-scan what follows.
                bytes.push(b'%');
            }
            b'+' => bytes.push(b' '),
            other => bytes.push(other),
        }
        i += 1;
    }

    String::from_utf8(bytes).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
}
260
/// Strip HTML tags from a string and decode a few common entities.
///
/// Every `<...>` span (from a `<` up to the next `>`) is removed; a `<` with
/// no `>` after it is kept as text. Afterwards `&lt;`, `&gt;`, `&quot;`,
/// `&#39;`, `&nbsp;` and `&amp;` are decoded and surrounding whitespace is
/// trimmed.
///
/// `&amp;` is decoded last so that text is decoded exactly once:
/// `&amp;lt;` becomes the literal text `&lt;`, not `<`.
fn strip_html_tags(s: &str) -> String {
    // Remove tags with a plain scan (equivalent to deleting matches of
    // `<[^>]*>`), which avoids compiling a regex on every call.
    let mut without_tags = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('<') {
        // No closing '>' anywhere after this '<': nothing else is a tag.
        let Some(len) = rest[open..].find('>') else {
            break;
        };
        without_tags.push_str(&rest[..open]);
        rest = &rest[open + len + 1..];
    }
    without_tags.push_str(rest);

    // Decode common HTML entities; `&amp;` must come last (see above).
    without_tags
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
        .trim()
        .to_string()
}
278
279#[cfg(test)]
280mod tests {
281    use super::*;
282
283    #[test]
284    fn test_strip_html_tags() {
285        assert_eq!(strip_html_tags("<b>hello</b>"), "hello");
286        assert_eq!(strip_html_tags("a &amp; b"), "a & b");
287        assert_eq!(strip_html_tags("  <span>test</span>  "), "test");
288    }
289
290    #[test]
291    fn test_urlencoding_decode() {
292        assert_eq!(urlencoding_decode_simple("hello%20world"), "hello world");
293        assert_eq!(urlencoding_decode_simple("a+b"), "a b");
294        assert_eq!(urlencoding_decode_simple("%3Ctest%3E"), "<test>");
295    }
296
297    #[test]
298    fn test_clean_url() {
299        let redirect_url = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc";
300        assert_eq!(clean_url(redirect_url), "https://example.com");
301
302        let normal_url = "https://example.com/page";
303        assert_eq!(clean_url(normal_url), "https://example.com/page");
304    }
305}