use anyhow::{Context, Result};
use reqwest::Client;
use crate::tools::websearch::parser::{clean_url, SearchResult, SearchResultParser, strip_html_tags};
/// Stateless parser for the markup returned by DuckDuckGo's HTML endpoint
/// (`https://html.duckduckgo.com/html/`), as fetched by [`search`].
///
/// Result links are recognised by the `result__a` CSS class and snippets by
/// `result__snippet`; see the `SearchResultParser` impl for the details.
pub struct DuckDuckGoHtmlParser;
impl SearchResultParser for DuckDuckGoHtmlParser {
    /// Extracts at most `max_results` results from a DuckDuckGo HTML results page.
    ///
    /// Every anchor whose `class` attribute contains `result__a` contributes a
    /// candidate: its `href` is passed through `clean_url` and its inner markup
    /// through `strip_html_tags`. Candidates with an empty URL or title, or whose
    /// cleaned URL contains `duckduckgo.com`, are skipped.
    ///
    /// A result's snippet is the first `result__snippet` anchor that starts after
    /// the link's start and fewer than 1000 bytes past it; if there is none the
    /// snippet is `None`.
    ///
    /// When the primary pattern yields nothing, `fallback_parse` is tried with a
    /// looser pattern. A pattern that fails to compile is treated as "no matches"
    /// rather than an error, so this function never panics on its own account.
    fn parse(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        // Maximum byte distance between the start of a link and the start of the
        // snippet that is still attributed to it.
        const SNIPPET_WINDOW: usize = 1000;

        let mut results = Vec::new();
        let link_regex = regex::Regex::new(
            r#"<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]*)"[^>]*>(.*?)</a>"#,
        )
        .ok();
        let snippet_regex = regex::Regex::new(
            r#"<a[^>]*class="[^"]*result__snippet[^"]*"[^>]*>(.*?)</a>"#,
        )
        .ok();

        if let Some(link_re) = link_regex.as_ref() {
            // Links and snippets are both yielded in increasing document order,
            // so a single forward-only cursor over the snippet matches serves
            // every link. Restarting the snippet scan from the top of the page
            // for each link (as was done before) is quadratic in the page size.
            let mut snippets = snippet_regex
                .as_ref()
                .map(|re| re.captures_iter(html).peekable());

            for cap in link_re.captures_iter(html) {
                if results.len() >= max_results {
                    break;
                }

                let url = cap
                    .get(1)
                    .map(|m| clean_url(m.as_str()))
                    .unwrap_or_default();
                let title = cap
                    .get(2)
                    .map(|m| strip_html_tags(m.as_str()))
                    .unwrap_or_default();
                if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
                    continue;
                }

                let snippet = match (snippets.as_mut(), cap.get(0)) {
                    (Some(cursor), Some(link)) => {
                        let link_pos = link.start();
                        let window_end = link_pos.saturating_add(SNIPPET_WINDOW);

                        // Drop snippets that start at or before this link: every
                        // later link starts further on, so none can use them.
                        while cursor
                            .next_if(|c| c.get(0).is_some_and(|m| m.start() <= link_pos))
                            .is_some()
                        {}

                        // Peek instead of consuming: a following link that is
                        // still within the window may be paired with the same
                        // snippet, exactly as a fresh scan would have done.
                        cursor
                            .peek()
                            .filter(|c| c.get(0).is_some_and(|m| m.start() < window_end))
                            .and_then(|c| c.get(1))
                            .map(|m| strip_html_tags(m.as_str()))
                    }
                    _ => None,
                };

                results.push(SearchResult { title, url, snippet });
            }
        }

        if results.is_empty() {
            fallback_parse(html, max_results, &mut results);
        }
        results
    }
}
/// Stateless parser for the markup returned by DuckDuckGo's Lite endpoint
/// (`https://lite.duckduckgo.com/lite/`), as fetched by [`search_lite`].
///
/// Results produced by this parser never carry a snippet.
pub struct DuckDuckGoLiteParser;
impl SearchResultParser for DuckDuckGoLiteParser {
    /// Extracts at most `max_results` results from a DuckDuckGo Lite results page.
    ///
    /// Matches anchors that carry `rel="nofollow"` followed by an `href` and whose
    /// content is plain text (no nested tags). The `href` goes through `clean_url`
    /// and the text through `strip_html_tags`; entries with an empty URL or title,
    /// or whose cleaned URL contains `duckduckgo.com`, are dropped. Snippets are
    /// always `None`. Returns an empty vector if the pattern fails to compile.
    fn parse(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        let Ok(anchor_re) = regex::Regex::new(
            r#"<a[^>]*rel="nofollow"[^>]*href="([^"]*)"[^>]*>([^<]+)</a>"#,
        ) else {
            return Vec::new();
        };

        anchor_re
            .captures_iter(html)
            .filter_map(|caps| {
                let url = clean_url(caps.get(1).map_or("", |m| m.as_str()));
                let title = strip_html_tags(caps.get(2).map_or("", |m| m.as_str()));

                let usable =
                    !url.is_empty() && !title.is_empty() && !url.contains("duckduckgo.com");
                usable.then(|| SearchResult {
                    title,
                    url,
                    snippet: None,
                })
            })
            // Applied after filtering so that skipped anchors do not count
            // against the limit.
            .take(max_results)
            .collect()
    }
}
/// Looser second-chance extraction: appends to `results` until it holds
/// `max_results` entries.
///
/// Matches any anchor whose `class` attribute contains `result`, followed by an
/// `href`, with tag-free content. Entries with an empty URL or title, or whose
/// cleaned URL contains `duckduckgo.com`, are skipped. Appended results have no
/// snippet. Does nothing if the pattern fails to compile or `results` is already
/// at (or beyond) `max_results`.
fn fallback_parse(html: &str, max_results: usize, results: &mut Vec<SearchResult>) {
    let Ok(loose_re) = regex::Regex::new(
        r#"<a[^>]*class="[^"]*result[^"]*"[^>]*href="([^"]*)"[^>]*>([^<]*)</a>"#,
    ) else {
        return;
    };

    // How many more entries may be appended before the cap is reached.
    let remaining = max_results.saturating_sub(results.len());

    let extra = loose_re
        .captures_iter(html)
        .filter_map(|caps| {
            let url = clean_url(caps.get(1).map_or("", |m| m.as_str()));
            let title = caps
                .get(2)
                .map(|m| strip_html_tags(m.as_str()))
                .unwrap_or_default();

            if url.is_empty() || title.is_empty() || url.contains("duckduckgo.com") {
                None
            } else {
                Some(SearchResult {
                    title,
                    url,
                    snippet: None,
                })
            }
        })
        .take(remaining);

    results.extend(extra);
}
/// Searches DuckDuckGo for `query`, returning at most `max_results` results.
///
/// The Lite endpoint is tried first via [`search_lite`]. If that attempt fails
/// or yields no results, its outcome is discarded and the HTML endpoint
/// (`html.duckduckgo.com`) is queried and parsed with [`DuckDuckGoHtmlParser`].
///
/// # Errors
///
/// Returns an error if the HTML-endpoint request cannot be sent, the server
/// answers with a non-success status, or the response body cannot be read.
/// Errors from the Lite attempt are never surfaced.
pub async fn search(client: &Client, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    // Best-effort first attempt; anything other than a non-empty success falls
    // through to the HTML endpoint.
    match search_lite(client, query, max_results).await {
        Ok(hits) if !hits.is_empty() => return Ok(hits),
        _ => {}
    }

    let endpoint = format!(
        "https://html.duckduckgo.com/html/?q={}",
        urlencoding::encode(query)
    );

    let response = client
        .get(&endpoint)
        .send()
        .await
        .context("DuckDuckGo request failed")?;

    let status = response.status();
    if !status.is_success() {
        anyhow::bail!("DuckDuckGo returned status: {}", status);
    }

    let body = response
        .text()
        .await
        .context("Failed to read DuckDuckGo response")?;

    Ok(DuckDuckGoHtmlParser.parse(&body, max_results))
}
/// Queries DuckDuckGo's Lite endpoint (`lite.duckduckgo.com`) for `query` and
/// parses the response with [`DuckDuckGoLiteParser`], returning at most
/// `max_results` results.
///
/// # Errors
///
/// Returns an error if the request cannot be sent, the server answers with a
/// non-success status, or the response body cannot be read. An empty result
/// list is not an error.
pub async fn search_lite(client: &Client, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let endpoint = format!(
        "https://lite.duckduckgo.com/lite/?q={}",
        urlencoding::encode(query)
    );

    let response = client
        .get(&endpoint)
        .send()
        .await
        .context("DuckDuckGo Lite request failed")?;

    let status = response.status();
    if !status.is_success() {
        anyhow::bail!("DuckDuckGo Lite returned status: {}", status);
    }

    let body = response
        .text()
        .await
        .context("Failed to read DuckDuckGo Lite response")?;

    Ok(DuckDuckGoLiteParser.parse(&body, max_results))
}