use super::utils::{percent_decode, truncate_chars, urlencode};
use super::{SearchProvider, SearchResult};
use crate::error::{Result, ToolError};
use async_trait::async_trait;
use reqwest::Client;
use scraper::{Html, Selector};
use serde::Deserialize;
use std::time::Duration;
use tracing::warn;
/// Web search provider backed by DuckDuckGo.
pub struct DuckDuckGoProvider {
    /// HTTP client used for every request this provider issues.
    client: Client,
}
impl DuckDuckGoProvider {
    /// Creates a provider whose HTTP client sends a desktop-browser
    /// `User-Agent` and gives up on a request after 15 seconds.
    ///
    /// If the configured client cannot be built, a default `Client::new()`
    /// is used instead.
    pub fn new() -> Self {
        const BROWSER_UA: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
                                  AppleWebKit/537.36 (KHTML, like Gecko) \
                                  Chrome/131.0.0.0 Safari/537.36";

        let configured = Client::builder()
            .user_agent(BROWSER_UA)
            .timeout(Duration::from_secs(15))
            .build();
        let client = match configured {
            Ok(client) => client,
            Err(_) => Client::new(),
        };
        Self { client }
    }

    /// Resolves the target URL of a result link.
    ///
    /// * When the link carries a `uddg` query parameter, its value (up to the
    ///   next `&`) is percent-decoded and returned.
    /// * A protocol-relative link (`//host/...`) gets an `https:` prefix.
    /// * Anything else is returned unchanged.
    fn extract_url(href: &str) -> String {
        // Order matters: "?uddg=" anywhere in the link wins over "&uddg=".
        for marker in ["?uddg=", "&uddg="] {
            if let Some((_, tail)) = href.split_once(marker) {
                let target = tail.split('&').next().unwrap_or(tail);
                return percent_decode(target);
            }
        }

        match href.strip_prefix("//") {
            Some(_) => format!("https:{}", href),
            None => href.to_string(),
        }
    }
}
impl Default for DuckDuckGoProvider {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl SearchProvider for DuckDuckGoProvider {
    /// Identifier of this provider.
    fn name(&self) -> &str {
        "duckduckgo"
    }

    /// Searches DuckDuckGo for `query`, returning at most `max_results` hits.
    ///
    /// The HTML endpoint is queried first. If the page looks like an
    /// anti-bot challenge, or no result can be parsed from it, the request
    /// is retried through `search_via_api`.
    ///
    /// # Errors
    ///
    /// Returns `ToolError::ExecutionFailed` when the HTML request fails,
    /// answers with a non-success status, or its body cannot be read.
    /// Errors from the fallback are propagated as-is.
    async fn search(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        // Every failure in this method is reported under the same tool name.
        let fail = |message: String| ToolError::ExecutionFailed {
            tool: "web_search".into(),
            message,
        };

        let endpoint = format!(
            "https://html.duckduckgo.com/html/?q={}",
            urlencode(query)
        );
        let reply = self
            .client
            .get(&endpoint)
            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
            .send()
            .await
            .map_err(|e| fail(format!("DuckDuckGo 请求失败: {}", e)))?;

        if !reply.status().is_success() {
            return Err(fail(format!("DuckDuckGo 返回错误状态: {}", reply.status())).into());
        }

        let body = reply
            .text()
            .await
            .map_err(|e| fail(format!("读取响应体失败: {}", e)))?;

        // Either marker in the page means the HTML endpoint refused to serve results.
        let blocked = ["anomaly-modal", "bots use DuckDuckGo"]
            .iter()
            .any(|marker| body.contains(*marker));
        if blocked {
            warn!("DuckDuckGo HTML 搜索被反爬拦截,降级到 Instant Answer API");
            return self.search_via_api(query, max_results).await;
        }

        match parse_ddg_html(&body, max_results)? {
            parsed if !parsed.is_empty() => Ok(parsed),
            _ => {
                warn!("DuckDuckGo HTML 未解析到结果,降级到 Instant Answer API");
                self.search_via_api(query, max_results).await
            }
        }
    }
}
impl DuckDuckGoProvider {
    /// Fallback search through the DuckDuckGo Instant Answer JSON API.
    ///
    /// Builds up to `max_results` entries from the response:
    ///
    /// 1. the abstract, when both its text and URL are non-empty; its title
    ///    is the response heading, or `query` when the heading is missing or
    ///    blank;
    /// 2. related topics that have both a `Text` and a `FirstURL`, including
    ///    topics nested inside `{"Name": .., "Topics": [..]}` groups.
    ///
    /// Snippets are truncated to 300 characters.
    ///
    /// # Errors
    ///
    /// Returns `ToolError::ExecutionFailed` when the request fails, the
    /// status is not a success, or the body is not the expected JSON.
    async fn search_via_api(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let url = format!(
            "https://api.duckduckgo.com/?q={}&format=json&no_html=1",
            urlencode(query)
        );
        let response = self
            .client
            .get(&url)
            .send()
            .await
            .map_err(|e| ToolError::ExecutionFailed {
                tool: "web_search".into(),
                message: format!("DuckDuckGo API 请求失败: {}", e),
            })?;
        if !response.status().is_success() {
            return Err(ToolError::ExecutionFailed {
                tool: "web_search".into(),
                message: format!("DuckDuckGo API 返回错误: {}", response.status()),
            }
            .into());
        }
        let api_resp: DdgApiResponse =
            response
                .json()
                .await
                .map_err(|e| ToolError::ExecutionFailed {
                    tool: "web_search".into(),
                    message: format!("DuckDuckGo API 响应解析失败: {}", e),
                })?;

        let mut results = Vec::new();

        // The abstract counts against the cap too, so skip it when the caller
        // asked for zero results.
        if max_results > 0
            && !api_resp.abstract_text.is_empty()
            && !api_resp.abstract_url.is_empty()
        {
            // A heading that is present but blank is as useless as a missing
            // one; fall back to the query in both cases.
            let title = api_resp
                .heading
                .filter(|heading| !heading.trim().is_empty())
                .unwrap_or_else(|| query.to_string());
            results.push(SearchResult {
                title,
                url: api_resp.abstract_url,
                snippet: truncate_chars(&api_resp.abstract_text, 300),
            });
        }

        'topics: for topic in &api_resp.related_topics {
            // An entry is either a topic itself or a named group whose
            // "Topics" array holds the actual topics; treat both uniformly.
            let entries = match topic.get("Topics").and_then(|v| v.as_array()) {
                Some(nested) => nested.as_slice(),
                None => std::slice::from_ref(topic),
            };
            for entry in entries {
                if results.len() >= max_results {
                    break 'topics;
                }
                let text = entry.get("Text").and_then(|v| v.as_str()).unwrap_or("");
                let url = entry.get("FirstURL").and_then(|v| v.as_str()).unwrap_or("");
                if text.is_empty() || url.is_empty() {
                    continue;
                }
                // The part of the text before the first " - " serves as the title.
                let title = text.split(" - ").next().unwrap_or(text).to_string();
                results.push(SearchResult {
                    title,
                    url: url.to_string(),
                    snippet: truncate_chars(text, 300),
                });
            }
        }

        Ok(results)
    }
}
/// The subset of the Instant Answer API JSON payload read by this provider.
///
/// Every field falls back to its `Default` value when the key is absent, so
/// a sparse payload deserializes to "no data" instead of failing the search.
#[derive(Debug, Deserialize)]
struct DdgApiResponse {
    /// Summary text of the abstract; empty when there is none.
    #[serde(rename = "AbstractText", default)]
    abstract_text: String,
    /// URL the abstract points to; empty when there is none.
    #[serde(rename = "AbstractURL", default)]
    abstract_url: String,
    /// Heading of the abstract.
    #[serde(rename = "Heading", default)]
    heading: Option<String>,
    /// Raw related-topic entries, kept as untyped JSON.
    #[serde(rename = "RelatedTopics", default)]
    related_topics: Vec<serde_json::Value>,
}
/// Extracts up to `max_results` search results from a DuckDuckGo HTML page.
///
/// Several container selectors are tried in order and the first one that
/// produces at least one usable result wins. A container is skipped when its
/// title or URL is empty, or when its URL contains `duckduckgo.com` without a
/// `uddg=` parameter. Returns an empty vector when nothing matches.
fn parse_ddg_html(html: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    const CONTAINERS: [&str; 3] = [".result", ".web-result", ".results_links"];
    const TITLES: [&str; 3] = ["a.result__a", "a.result__title", "h2 a"];
    const SNIPPETS: [&str; 3] = [".result__snippet", "td.result__snippet", ".snippet"];

    let document = Html::parse_document(html);

    for css in CONTAINERS {
        let Ok(container) = Selector::parse(css) else {
            continue;
        };

        let hits: Vec<SearchResult> = document
            .select(&container)
            .filter_map(|node| {
                let (title, url) = extract_title_and_url(&node, &TITLES);
                let unusable = title.is_empty() || url.is_empty();
                let internal = url.contains("duckduckgo.com") && !url.contains("uddg=");
                if unusable || internal {
                    return None;
                }
                let snippet = extract_snippet(&node, &SNIPPETS);
                Some(SearchResult {
                    title,
                    url,
                    snippet,
                })
            })
            // Lazy cap: no further containers are inspected once it is reached.
            .take(max_results)
            .collect();

        if !hits.is_empty() {
            return Ok(hits);
        }
    }

    Ok(Vec::new())
}
/// Locates the title link inside one result container.
///
/// Selectors are tried in order; the first one that matches any element
/// decides the outcome, even if that element's text or `href` is empty.
/// Returns `(title, url)` where the title is the trimmed link text and the
/// URL is the `href` passed through `DuckDuckGoProvider::extract_url`.
/// Both strings are empty when no selector matches.
fn extract_title_and_url(
    element: &scraper::ElementRef,
    title_selectors: &[&str],
) -> (String, String) {
    title_selectors
        .iter()
        // Selectors that fail to parse are skipped.
        .filter_map(|css| Selector::parse(css).ok())
        .find_map(|selector| {
            let anchor = element.select(&selector).next()?;
            let title = anchor.text().collect::<String>().trim().to_string();
            let target = anchor.value().attr("href").unwrap_or("");
            Some((title, DuckDuckGoProvider::extract_url(target)))
        })
        .unwrap_or_default()
}
fn extract_snippet(element: &scraper::ElementRef, snippet_selectors: &[&str]) -> String {
for sel_str in snippet_selectors {
let Ok(selector) = Selector::parse(sel_str) else {
continue;
};
if let Some(s) = element.select(&selector).next() {
let text = s.text().collect::<String>().trim().to_string();
if !text.is_empty() {
return text;
}
}
}
String::new()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A redirect link with `uddg` as its first parameter is unwrapped and decoded.
    #[test]
    fn test_extract_url_redirect() {
        let resolved = DuckDuckGoProvider::extract_url(
            "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com&rut=abc",
        );
        assert_eq!(resolved, "https://example.com");
    }

    /// An ordinary absolute URL passes through untouched.
    #[test]
    fn test_extract_url_no_false_positive() {
        let plain = "https://example.com/page?foo=bar";
        let resolved = DuckDuckGoProvider::extract_url(plain);
        assert_eq!(resolved, "https://example.com/page?foo=bar");
    }

    /// `uddg` is also recognised when it is not the first query parameter.
    #[test]
    fn test_extract_url_amp_uddg() {
        let resolved = DuckDuckGoProvider::extract_url(
            "//duckduckgo.com/l/?foo=1&uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc",
        );
        assert_eq!(resolved, "https://example.com/page");
    }

    /// Protocol-relative links are upgraded to `https:`.
    #[test]
    fn test_extract_url_protocol_relative() {
        let resolved = DuckDuckGoProvider::extract_url("//example.com/page");
        assert_eq!(resolved, "https://example.com/page");
    }

    /// Sanity check that the sample challenge page contains the marker string.
    #[test]
    fn test_detect_captcha() {
        let page =
            r#"<html><body><div class="anomaly-modal">bots use DuckDuckGo too</div></body></html>"#;
        assert!(page.contains("anomaly-modal"));
    }

    /// A single well-formed result container yields one decoded result.
    #[test]
    fn test_parse_ddg_html_with_results() {
        let page = r#"
<html><body>
<div class="result">
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.rust-lang.org%2F&rut=abc">Rust Programming Language</a>
<a class="result__snippet">A language empowering everyone to build reliable and efficient software.</a>
</div>
</body></html>"#;
        let parsed = parse_ddg_html(page, 10).unwrap();
        assert_eq!(parsed.len(), 1);
        let first = &parsed[0];
        assert_eq!(first.title, "Rust Programming Language");
        assert_eq!(first.url, "https://www.rust-lang.org/");
    }

    /// A page without any result container parses to an empty list.
    #[test]
    fn test_parse_ddg_html_empty() {
        let parsed = parse_ddg_html("<html><body><p>No results</p></body></html>", 10).unwrap();
        assert!(parsed.is_empty());
    }
}