matrixcode_core/tools/
websearch.rs1use anyhow::Result;
2use async_trait::async_trait;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use serde_json::{Value, json};
6
7use super::{Tool, ToolDefinition};
8
9pub struct WebSearchTool;
12
13#[async_trait]
14impl Tool for WebSearchTool {
15 fn definition(&self) -> ToolDefinition {
16 ToolDefinition {
17 name: "websearch".to_string(),
18 description: "使用 DuckDuckGo 搜索网络信息。返回包含标题、URL 和摘要的搜索结果列表。用于查找互联网上的最新信息。".to_string(),
19 parameters: json!({
20 "type": "object",
21 "properties": {
22 "query": {
23 "type": "string",
24 "description": "搜索查询"
25 },
26 "max_results": {
27 "type": "integer",
28 "description": "最大返回结果数(默认 5,最大 10)"
29 }
30 },
31 "required": ["query"]
32 }),
33 }
34 }
35
36 async fn execute(&self, params: Value) -> Result<String> {
37 let query = params["query"]
38 .as_str()
39 .ok_or_else(|| anyhow::anyhow!("missing 'query' parameter"))?;
40 let max_results = params["max_results"].as_u64().unwrap_or(5).min(10) as usize;
41
42 let results = search_duckduckgo(query, max_results).await?;
46
47 if results.is_empty() {
48 return Ok("No results found.".to_string());
50 }
51
52 let output = results
53 .iter()
54 .enumerate()
55 .map(|(i, r)| {
56 let mut s = format!("{}. {}\n {}", i + 1, r.title, r.url);
57 if let Some(ref snippet) = r.snippet {
58 s.push_str(&format!("\n {}", snippet));
59 }
60 s
61 })
62 .collect::<Vec<_>>()
63 .join("\n\n");
64
65 Ok(output)
67 }
68}
69
70#[derive(Debug, Clone, Serialize, Deserialize)]
72struct SearchResult {
73 title: String,
74 url: String,
75 snippet: Option<String>,
76}
77
78async fn search_duckduckgo(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
80 let client = reqwest::Client::builder()
81 .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
82 .build()?;
83
84 let url = format!(
85 "https://html.duckduckgo.com/html/?q={}",
86 urlencoding_encode(query)
87 );
88
89 let response = client.get(&url).send().await?;
90
91 if !response.status().is_success() {
92 anyhow::bail!("Search request failed with status: {}", response.status());
93 }
94
95 let html = response.text().await?;
96 let results = parse_ddg_html(&html, max_results);
97
98 Ok(results)
99}
100
101fn parse_ddg_html(html: &str, max_results: usize) -> Vec<SearchResult> {
103 let mut results = Vec::new();
104
105 let _result_div_regex =
111 Regex::new(r#"<div[^>]*class="[^"]*result[^"]*"[^>]*>(.*?)</div>\s*</div>"#).ok();
112 let link_regex =
113 Regex::new(r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).ok();
114 let snippet_regex =
115 Regex::new(r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#).ok();
116
117 if let Some(ref link_re) = link_regex {
119 for cap in link_re.captures_iter(html) {
120 if results.len() >= max_results {
121 break;
122 }
123
124 let url = cap
125 .get(1)
126 .map(|m| clean_url(m.as_str()))
127 .unwrap_or_default();
128 let title = cap
129 .get(2)
130 .map(|m| strip_html_tags(m.as_str()))
131 .unwrap_or_default();
132
133 if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
135 continue;
136 }
137
138 let snippet = snippet_regex.as_ref().and_then(|snip_re| {
140 snip_re
141 .captures_iter(html)
142 .find(|c| {
143 if let Some(m) = c.get(0) {
144 let link_pos = cap.get(0).unwrap().start();
146 let snip_pos = m.start();
147 snip_pos > link_pos && snip_pos < link_pos + 1000
148 } else {
149 false
150 }
151 })
152 .and_then(|c| c.get(1).map(|m| strip_html_tags(m.as_str())))
153 });
154
155 results.push(SearchResult {
156 title,
157 url,
158 snippet,
159 });
160 }
161 }
162
163 if results.is_empty() {
165 let alt_link_re =
167 Regex::new(r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#)
168 .ok();
169 if let Some(re) = alt_link_re {
170 for cap in re.captures_iter(html) {
171 if results.len() >= max_results {
172 break;
173 }
174
175 let url = clean_url(cap.get(1).map(|m| m.as_str()).unwrap_or_default());
176 let title = cap
177 .get(2)
178 .map(|m| strip_html_tags(m.as_str()))
179 .unwrap_or_default();
180
181 if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
182 continue;
183 }
184
185 results.push(SearchResult {
186 title,
187 url,
188 snippet: None,
189 });
190 }
191 }
192 }
193
194 results
195}
196
197fn clean_url(url: &str) -> String {
199 if url.contains("duckduckgo.com/l/")
202 && let Some(query) = url.split("uddg=").nth(1)
203 && let Some(encoded) = query.split('&').next()
204 && let Ok(decoded) = urlencoding_decode(encoded)
205 {
206 return decoded;
207 }
208 url.to_string()
209}
210
/// Encodes `s` for use as a URL query value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are kept as-is, a space
/// becomes `+`, and every other byte of the UTF-8 encoding is written as an
/// uppercase `%XX` escape.
fn urlencoding_encode(s: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::new();
    for &byte in s.as_bytes() {
        if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~') {
            encoded.push(char::from(byte));
        } else if byte == b' ' {
            encoded.push('+');
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX_UPPER[usize::from(byte & 0x0F)]));
        }
    }
    encoded
}
226
227fn urlencoding_decode(s: &str) -> Result<String> {
229 let decoded = urlencoding_decode_simple(s);
230 Ok(decoded)
231}
232
/// Decodes a URL-encoded string: `%XX` escapes become the byte `0xXX` and
/// `+` becomes a space.
///
/// A `%` that is not followed by exactly two hexadecimal digits is kept
/// literally, and decoding resumes with the character right after it, so a
/// malformed escape never swallows or misreads the text that follows.
/// If the decoded bytes are not valid UTF-8, invalid sequences are replaced
/// with U+FFFD.
fn urlencoding_decode_simple(s: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let input = s.as_bytes();
    let mut bytes: Vec<u8> = Vec::with_capacity(input.len());
    let mut pos = 0;

    while pos < input.len() {
        match input[pos] {
            b'%' => {
                // Validate both digits explicitly: integer parsing would also
                // accept a sign (`%+1`) or a single digit (`%4`).
                let high = input.get(pos + 1).copied().and_then(hex_value);
                let low = input.get(pos + 2).copied().and_then(hex_value);
                if let (Some(high), Some(low)) = (high, low) {
                    bytes.push((high << 4) | low);
                    pos += 3;
                    continue;
                }
                bytes.push(b'%');
            }
            b'+' => bytes.push(b' '),
            other => bytes.push(other),
        }
        pos += 1;
    }

    String::from_utf8(bytes).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
}
260
261fn strip_html_tags(s: &str) -> String {
263 let re = Regex::new(r"<[^>]*>").unwrap();
265 let without_tags = re.replace_all(s, "");
266
267 without_tags
269 .replace("&", "&")
270 .replace("<", "<")
271 .replace(">", ">")
272 .replace(""", "\"")
273 .replace("'", "'")
274 .replace(" ", " ")
275 .trim()
276 .to_string()
277}
278
279#[cfg(test)]
280mod tests {
281 use super::*;
282
283 #[test]
284 fn test_strip_html_tags() {
285 assert_eq!(strip_html_tags("<b>hello</b>"), "hello");
286 assert_eq!(strip_html_tags("a & b"), "a & b");
287 assert_eq!(strip_html_tags(" <span>test</span> "), "test");
288 }
289
290 #[test]
291 fn test_urlencoding_decode() {
292 assert_eq!(urlencoding_decode_simple("hello%20world"), "hello world");
293 assert_eq!(urlencoding_decode_simple("a+b"), "a b");
294 assert_eq!(urlencoding_decode_simple("%3Ctest%3E"), "<test>");
295 }
296
297 #[test]
298 fn test_clean_url() {
299 let redirect_url = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc";
300 assert_eq!(clean_url(redirect_url), "https://example.com");
301
302 let normal_url = "https://example.com/page";
303 assert_eq!(clean_url(normal_url), "https://example.com/page");
304 }
305}