use anyhow::Result;
use async_trait::async_trait;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use super::{Tool, ToolDefinition};
/// Stateless tool type that exposes DuckDuckGo web search through the
/// [`Tool`] trait implemented for it in this file.
pub struct WebSearchTool;
#[async_trait]
impl Tool for WebSearchTool {
    /// Describes the `websearch` tool: its name, a human-readable description,
    /// and a JSON schema with a required `query` string and an optional
    /// `max_results` integer.
    fn definition(&self) -> ToolDefinition {
        let parameters = json!({
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query"
                },
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results to return (default 5, max 10)"
                }
            },
            "required": ["query"]
        });

        ToolDefinition {
            name: "websearch".to_string(),
            description: "Search the web for information using DuckDuckGo. Returns a list of search results with titles, URLs, and snippets. Use this tool when you need to find current information on the internet.".to_string(),
            parameters,
        }
    }

    /// Runs a search and renders the hits as a numbered plain-text list.
    ///
    /// `params["query"]` must be a string. `params["max_results"]` is optional:
    /// it falls back to 5 when absent or not an unsigned integer, and is capped
    /// at 10.
    ///
    /// # Errors
    ///
    /// Fails when `query` is missing (or not a string) or when
    /// `search_duckduckgo` reports an error.
    async fn execute(&self, params: Value) -> Result<String> {
        let Some(query) = params["query"].as_str() else {
            anyhow::bail!("missing 'query' parameter");
        };
        let max_results = params["max_results"].as_u64().map_or(5, |n| n.min(10)) as usize;

        let results = search_duckduckgo(query, max_results).await?;
        if results.is_empty() {
            return Ok("No results found.".to_string());
        }

        // One entry per hit: "<n>. <title>", then the URL, then the snippet if any.
        let mut entries = Vec::with_capacity(results.len());
        for (index, hit) in results.iter().enumerate() {
            let entry = match &hit.snippet {
                Some(snippet) => {
                    format!("{}. {}\n {}\n {}", index + 1, hit.title, hit.url, snippet)
                }
                None => format!("{}. {}\n {}", index + 1, hit.title, hit.url),
            };
            entries.push(entry);
        }

        // Entries are separated by a blank line.
        Ok(entries.join("\n\n"))
    }
}
/// One search hit as extracted by `parse_ddg_html`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SearchResult {
// Link text of the result with HTML tags stripped.
title: String,
// Target URL after `clean_url` has been applied.
url: String,
// Snippet text paired with the result; `None` when no snippet was found.
snippet: Option<String>,
}
/// Queries DuckDuckGo's HTML endpoint and returns up to `max_results` parsed hits.
///
/// The query is form-encoded with `urlencoding_encode` and the response body is
/// handed to `parse_ddg_html`. When `max_results` is 0 no request is made, since
/// the parser could not return anything anyway.
///
/// # Errors
///
/// Fails if the HTTP client cannot be built, the request fails or exceeds the
/// timeout, the server answers with a non-success status, or the body cannot
/// be read as text.
async fn search_duckduckgo(query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    // reqwest applies no timeout by default, so a stalled server would block
    // the caller indefinitely; bound the whole request instead.
    const REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(15);

    if max_results == 0 {
        return Ok(Vec::new());
    }

    let client = reqwest::Client::builder()
        .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        .timeout(REQUEST_TIMEOUT)
        .build()?;

    let url = format!("https://html.duckduckgo.com/html/?q={}", urlencoding_encode(query));
    let response = client.get(&url).send().await?;

    let status = response.status();
    if !status.is_success() {
        anyhow::bail!("Search request failed with status: {}", status);
    }

    let html = response.text().await?;
    Ok(parse_ddg_html(&html, max_results))
}
fn parse_ddg_html(html: &str, max_results: usize) -> Vec<SearchResult> {
let mut results = Vec::new();
let _result_div_regex = Regex::new(r#"<div[^>]*class="[^"]*result[^"]*"[^>]*>(.*?)</div>\s*</div>"#).ok();
let link_regex = Regex::new(r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#).ok();
let snippet_regex = Regex::new(r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#).ok();
if let Some(ref link_re) = link_regex {
for cap in link_re.captures_iter(html) {
if results.len() >= max_results {
break;
}
let url = cap.get(1).map(|m| clean_url(m.as_str())).unwrap_or_default();
let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
continue;
}
let snippet = snippet_regex.as_ref().and_then(|snip_re| {
snip_re.captures_iter(html)
.find(|c| {
if let Some(m) = c.get(0) {
let link_pos = cap.get(0).unwrap().start();
let snip_pos = m.start();
snip_pos > link_pos && snip_pos < link_pos + 1000
} else {
false
}
})
.and_then(|c| c.get(1).map(|m| strip_html_tags(m.as_str())))
});
results.push(SearchResult {
title,
url,
snippet,
});
}
}
if results.is_empty() {
let alt_link_re = Regex::new(r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#).ok();
if let Some(re) = alt_link_re {
for cap in re.captures_iter(html) {
if results.len() >= max_results {
break;
}
let url = clean_url(cap.get(1).map(|m| m.as_str()).unwrap_or_default());
let title = cap.get(2).map(|m| strip_html_tags(m.as_str())).unwrap_or_default();
if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
continue;
}
results.push(SearchResult {
title,
url,
snippet: None,
});
}
}
}
results
}
fn clean_url(url: &str) -> String {
if url.contains("duckduckgo.com/l/")
&& let Some(query) = url.split("uddg=").nth(1)
&& let Some(encoded) = query.split('&').next()
&& let Ok(decoded) = urlencoding_decode(encoded) {
return decoded;
}
url.to_string()
}
/// Percent-encodes `s` for use as a query-string value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` pass through unchanged, a space
/// becomes `+`, and every other byte of the UTF-8 encoding is written as `%XX`
/// with uppercase hex digits.
fn urlencoding_encode(s: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::new();
    // Working on bytes is equivalent to working on chars here: every byte of a
    // multi-byte UTF-8 sequence is >= 0x80 and therefore always escaped.
    for &byte in s.as_bytes() {
        if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~') {
            encoded.push(char::from(byte));
        } else if byte == b' ' {
            encoded.push('+');
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX_UPPER[usize::from(byte & 0x0F)]));
        }
    }
    encoded
}
/// `Result`-returning wrapper around `urlencoding_decode_simple`.
///
/// The underlying decoder cannot fail, so this always yields `Ok`.
fn urlencoding_decode(s: &str) -> Result<String> {
    Ok(urlencoding_decode_simple(s))
}
/// Decodes a form-urlencoded string: `%XX` escapes become the byte `0xXX` and
/// `+` becomes a space.
///
/// An escape is only honoured when `%` is followed by exactly two ASCII hex
/// digits. A malformed or truncated escape (for example a trailing `%4`, or
/// `%+1`) leaves the `%` in the output as a literal and decoding resumes with
/// the very next character, so a valid escape right after a stray `%` is still
/// decoded.
///
/// Decoded bytes that do not form valid UTF-8 are replaced with U+FFFD.
fn urlencoding_decode_simple(s: &str) -> String {
    // Value of a single ASCII hex digit, or `None` for anything else
    // (notably `+`/`-`, which `u8::from_str_radix` would have accepted).
    fn hex_value(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let input = s.as_bytes();
    let mut bytes: Vec<u8> = Vec::with_capacity(input.len());
    let mut i = 0;

    while i < input.len() {
        match input[i] {
            b'%' => {
                let hi = input.get(i + 1).copied().and_then(hex_value);
                let lo = input.get(i + 2).copied().and_then(hex_value);
                if let (Some(hi), Some(lo)) = (hi, lo) {
                    bytes.push((hi << 4) | lo);
                    i += 3;
                    continue;
                }
                // Not a valid escape: keep the '%' and re-examine what follows.
                bytes.push(b'%');
            }
            b'+' => bytes.push(b' '),
            other => bytes.push(other),
        }
        i += 1;
    }

    String::from_utf8(bytes).unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
}
/// Removes HTML tags from `s`, decodes a small set of character entities and
/// trims surrounding whitespace.
///
/// A tag is a `<` together with everything up to the next `>`; a `<` with no
/// later `>` is kept as text. After tag removal, entity references of the form
/// ampersand + name + semicolon are decoded for the names `amp`, `lt`, `gt`,
/// `quot`, `apos`, `#39`, `#x27` (apostrophe) and `nbsp` (decoded to a plain
/// space). Unknown references are left untouched.
///
/// Decoding is a single left-to-right pass, so text produced by decoding one
/// entity is never decoded a second time (a doubly escaped less-than sign
/// comes out as a singly escaped one, not as `<`).
fn strip_html_tags(s: &str) -> String {
    // Pass 1: drop tags.
    let mut text = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('<') {
        let Some(close) = rest[open..].find('>') else {
            break;
        };
        text.push_str(&rest[..open]);
        rest = &rest[open + close + 1..];
    }
    text.push_str(rest);

    // Pass 2: decode entities.
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text.as_str();
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp + 1..];

        let replacement = tail.find(';').and_then(|semi| {
            let ch = match &tail[..semi] {
                "amp" => '&',
                "lt" => '<',
                "gt" => '>',
                "quot" => '"',
                "apos" | "#39" | "#x27" => '\'',
                "nbsp" => ' ',
                _ => return None,
            };
            Some((ch, semi + 1))
        });

        match replacement {
            Some((ch, consumed)) => {
                decoded.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                // Not a known entity: keep the ampersand as ordinary text.
                decoded.push('&');
                rest = tail;
            }
        }
    }
    decoded.push_str(rest);

    decoded.trim().to_string()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_strip_html_tags() {
assert_eq!(strip_html_tags("<b>hello</b>"), "hello");
assert_eq!(strip_html_tags("a & b"), "a & b");
assert_eq!(strip_html_tags(" <span>test</span> "), "test");
}
#[test]
fn test_urlencoding_decode() {
assert_eq!(urlencoding_decode_simple("hello%20world"), "hello world");
assert_eq!(urlencoding_decode_simple("a+b"), "a b");
assert_eq!(urlencoding_decode_simple("%3Ctest%3E"), "<test>");
}
#[test]
fn test_clean_url() {
let redirect_url = "https://duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc";
assert_eq!(clean_url(redirect_url), "https://example.com");
let normal_url = "https://example.com/page";
assert_eq!(clean_url(normal_url), "https://example.com/page");
}
}