matrixcode_core/tools/
websearch.rs1use anyhow::Result;
2use async_trait::async_trait;
3use regex::Regex;
4use serde::{Deserialize, Serialize};
5use serde_json::{Value, json};
6
7use super::{Tool, ToolDefinition};
8
9pub struct WebSearchTool;
12
13#[async_trait]
14impl Tool for WebSearchTool {
15 fn definition(&self) -> ToolDefinition {
16 ToolDefinition {
17 name: "websearch".to_string(),
18 description: "Search the web for information using DuckDuckGo. Returns a list of search results with titles, URLs, and snippets. Use this tool when you need to find current information on the internet.".to_string(),
19 parameters: json!({
20 "type": "object",
21 "properties": {
22 "query": {
23 "type": "string",
24 "description": "The search query"
25 },
26 "max_results": {
27 "type": "integer",
28 "description": "Maximum number of results to return (default 5, max 10)"
29 }
30 },
31 "required": ["query"]
32 }),
33 }
34 }
35
36 async fn execute(&self, params: Value) -> Result<String> {
37 let query = params["query"].as_str().ok_or_else(|| anyhow::anyhow!("missing 'query' parameter"))?;
38 let max_results = params["max_results"].as_u64().unwrap_or(5).min(10) as usize;
39
40 let results = search_duckduckgo(query, max_results).await?;
44
45 if results.is_empty() {
46 return Ok("No results found.".to_string());
48 }
49
50 let output = results
51 .iter()
52 .enumerate()
53 .map(|(i, r)| {
54 let mut s = format!("{}. {}\n {}", i + 1, r.title, r.url);
55 if let Some(ref snippet) = r.snippet {
56 s.push_str(&format!("\n {}", snippet));
57 }
58 s
59 })
60 .collect::<Vec<_>>()
61 .join("\n\n");
62
63 Ok(output)
65 }
66}
67
68#[derive(Debug, Clone, Serialize, Deserialize)]
70struct SearchResult {
71 title: String,
72 url: String,
73 snippet: Option<String>,
74}
75
76async fn search_duckduckgo(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
78 let client = reqwest::Client::builder()
79 .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
80 .build()?;
81
82 let url = format!("https://html.duckduckgo.com/html/?q={}", urlencoding_encode(query));
83
84 let response = client
85 .get(&url)
86 .send()
87 .await?;
88
89 if !response.status().is_success() {
90 anyhow::bail!("Search request failed with status: {}", response.status());
91 }
92
93 let html = response.text().await?;
94 let results = parse_ddg_html(&html, max_results);
95
96 Ok(results)
97}
98
99fn parse_ddg_html(html: &str, max_results: usize) -> Vec<SearchResult> {
101 let mut results = Vec::new();
102
103 let _result_div_regex = Regex::new(r#"<div[^>]*class="[^"]*result[^"]*"[^>]*>(.*?)</div>\s*</div>"#).ok();
109 let link_regex = Regex::new(r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).ok();
110 let snippet_regex = Regex::new(r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#).ok();
111
112 if let Some(ref link_re) = link_regex {
114 for cap in link_re.captures_iter(html) {
115 if results.len() >= max_results {
116 break;
117 }
118
119 let url = cap.get(1).map(|m| clean_url(m.as_str())).unwrap_or_default();
120 let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
121
122 if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
124 continue;
125 }
126
127 let snippet = snippet_regex.as_ref().and_then(|snip_re| {
129 snip_re.captures_iter(html)
130 .find(|c| {
131 if let Some(m) = c.get(0) {
132 let link_pos = cap.get(0).unwrap().start();
134 let snip_pos = m.start();
135 snip_pos > link_pos && snip_pos < link_pos + 1000
136 } else {
137 false
138 }
139 })
140 .and_then(|c| c.get(1).map(|m| strip_html_tags(m.as_str())))
141 });
142
143 results.push(SearchResult {
144 title,
145 url,
146 snippet,
147 });
148 }
149 }
150
151 if results.is_empty() {
153 let alt_link_re = Regex::new(r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#).ok();
155 if let Some(re) = alt_link_re {
156 for cap in re.captures_iter(html) {
157 if results.len() >= max_results {
158 break;
159 }
160
161 let url = clean_url(cap.get(1).map(|m| m.as_str()).unwrap_or_default());
162 let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
163
164 if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
165 continue;
166 }
167
168 results.push(SearchResult {
169 title,
170 url,
171 snippet: None,
172 });
173 }
174 }
175 }
176
177 results
178}
179
180fn clean_url(url: &str) -> String {
182 if url.contains("duckduckgo.com/l/")
185 && let Some(query) = url.split("uddg=").nth(1)
186 && let Some(encoded) = query.split('&').next()
187 && let Ok(decoded) = urlencoding_decode(encoded) {
188 return decoded;
189 }
190 url.to_string()
191}
192
/// Encodes `s` for use as a URL query value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are kept as they are, a space
/// becomes `+`, and every other byte of the UTF-8 encoding is written as `%XX`
/// with uppercase hexadecimal digits.
fn urlencoding_encode(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::new();
    for byte in s.bytes() {
        if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~') {
            encoded.push(char::from(byte));
        } else if byte == b' ' {
            encoded.push('+');
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX_DIGITS[usize::from(byte & 0x0F)]));
        }
    }
    encoded
}
208
209fn urlencoding_decode(s: &str) -> Result<String> {
211 let decoded = urlencoding_decode_simple(s);
212 Ok(decoded)
213}
214
/// Decodes a percent-encoded string, treating `+` as a space.
///
/// A `%` is decoded only when it is followed by exactly two hexadecimal
/// digits; otherwise it is kept literally and decoding resumes with the very
/// next character, so nothing after a malformed escape is swallowed. Decoded
/// bytes that do not form valid UTF-8 are replaced with U+FFFD.
fn urlencoding_decode_simple(s: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, if `b` is one.
    fn hex_value(b: u8) -> Option<u8> {
        match b {
            b'0'..=b'9' => Some(b - b'0'),
            b'a'..=b'f' => Some(b - b'a' + 10),
            b'A'..=b'F' => Some(b - b'A' + 10),
            _ => None,
        }
    }

    let input = s.as_bytes();
    let mut bytes: Vec<u8> = Vec::with_capacity(input.len());
    let mut i = 0;

    while i < input.len() {
        match input[i] {
            b'%' => {
                // Both digits must be present and valid; signs or a lone
                // trailing digit do not make an escape.
                let high = input.get(i + 1).copied().and_then(hex_value);
                let low = input.get(i + 2).copied().and_then(hex_value);
                if let Some((high, low)) = high.zip(low) {
                    bytes.push((high << 4) | low);
                    i += 3;
                    continue;
                }
                bytes.push(b'%');
            }
            b'+' => bytes.push(b' '),
            other => bytes.push(other),
        }
        i += 1;
    }

    String::from_utf8(bytes).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
}
242
/// Removes HTML tags from `s`, decodes a small set of character entities and
/// trims surrounding whitespace.
///
/// A tag is everything from a `<` up to and including the next `>`; a `<` with
/// no later `>` is kept as text. Entities are decoded after tag removal and in
/// a single pass, so already-decoded output is never decoded again
/// (`&amp;lt;` yields `&lt;`, not `<`). Unknown entities are left untouched.
fn strip_html_tags(s: &str) -> String {
    /// Entities that are decoded, with their replacement text.
    const ENTITIES: [(&str, &str); 8] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
    ];

    // Pass 1: copy everything that lies outside `<...>` spans.
    let mut text = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('<') {
        let Some(len) = rest[open..].find('>') else {
            break;
        };
        text.push_str(&rest[..open]);
        rest = &rest[open + len + 1..];
    }
    text.push_str(rest);

    // Pass 2: decode entities left to right.
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text.as_str();
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        rest = &rest[amp..];

        let known = ENTITIES
            .iter()
            .find(|&&(entity, _)| rest.starts_with(entity));
        if let Some(&(entity, replacement)) = known {
            decoded.push_str(replacement);
            rest = &rest[entity.len()..];
        } else {
            decoded.push('&');
            rest = &rest[1..];
        }
    }
    decoded.push_str(rest);

    decoded.trim().to_string()
}
260
261#[cfg(test)]
262mod tests {
263 use super::*;
264
265 #[test]
266 fn test_strip_html_tags() {
267 assert_eq!(strip_html_tags("<b>hello</b>"), "hello");
268 assert_eq!(strip_html_tags("a & b"), "a & b");
269 assert_eq!(strip_html_tags(" <span>test</span> "), "test");
270 }
271
272 #[test]
273 fn test_urlencoding_decode() {
274 assert_eq!(urlencoding_decode_simple("hello%20world"), "hello world");
275 assert_eq!(urlencoding_decode_simple("a+b"), "a b");
276 assert_eq!(urlencoding_decode_simple("%3Ctest%3E"), "<test>");
277 }
278
279 #[test]
280 fn test_clean_url() {
281 let redirect_url = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc";
282 assert_eq!(clean_url(redirect_url), "https://example.com");
283
284 let normal_url = "https://example.com/page";
285 assert_eq!(clean_url(normal_url), "https://example.com/page");
286 }
287}