1use crate::{
2 engines::{http::HttpEngine, ScrapeEngine},
3 error::{Result, ScrapeError},
4 format,
5 types::{Document, ScrapeRequest, SearchResult},
6 utils::retry::{retry_with_backoff, RetryStrategy},
7};
8use scraper::{Html, Selector};
9use tracing::{info, warn};
10
11pub struct SearchProvider {
13 http_client: reqwest::Client,
14}
15
16impl SearchProvider {
17 pub fn new() -> Result<Self> {
18 let client = reqwest::Client::builder()
19 .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
20 .build()
21 .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
22
23 Ok(Self {
24 http_client: client,
25 })
26 }
27
28 pub async fn search_duckduckgo(&self, query: &str, limit: u32) -> Result<Vec<SearchResult>> {
30 let retry_config = RetryStrategy::Conservative.to_config();
32
33 retry_with_backoff(
35 || async { self.search_duckduckgo_once(query, limit).await },
36 &retry_config,
37 )
38 .await
39 }
40
41 async fn search_duckduckgo_once(
43 &self,
44 query: &str,
45 limit: u32,
46 ) -> Result<Vec<SearchResult>> {
47 info!("Searching DuckDuckGo for: {}", query);
48
49 let search_url = format!(
51 "https://html.duckduckgo.com/html/?q={}",
52 urlencoding::encode(query)
53 );
54
55 let response = self
57 .http_client
58 .get(&search_url)
59 .send()
60 .await
61 .map_err(ScrapeError::RequestFailed)?;
62
63 let html_content = response
64 .text()
65 .await
66 .map_err(ScrapeError::RequestFailed)?;
67
68 let document = Html::parse_document(&html_content);
70
71 let result_selector = Selector::parse(".result").expect("valid CSS selector");
73 let title_selector = Selector::parse(".result__a").expect("valid CSS selector");
74 let snippet_selector = Selector::parse(".result__snippet").expect("valid CSS selector");
75
76 let mut results = Vec::new();
77
78 for result_elem in document.select(&result_selector) {
79 if results.len() >= limit as usize {
80 break;
81 }
82
83 let title_elem = result_elem.select(&title_selector).next();
85 let snippet_elem = result_elem.select(&snippet_selector).next();
86
87 if let Some(title_node) = title_elem {
88 let title = title_node
89 .text()
90 .collect::<Vec<_>>()
91 .join(" ")
92 .trim()
93 .to_string();
94 let url = title_node.value().attr("href").unwrap_or("").to_string();
95
96 let actual_url = extract_url_from_duckduckgo(&url);
98
99 let snippet = snippet_elem
100 .map(|s| s.text().collect::<Vec<_>>().join(" ").trim().to_string())
101 .unwrap_or_default();
102
103 if !actual_url.is_empty() && actual_url.starts_with("http") {
104 results.push(SearchResult {
105 title,
106 url: actual_url,
107 snippet,
108 content: None,
109 });
110 }
111 }
112 }
113
114 info!("Found {} search results", results.len());
115 Ok(results)
116 }
117
118 pub async fn scrape_result(
120 &self,
121 mut result: SearchResult,
122 scrape_request: &ScrapeRequest,
123 ) -> SearchResult {
124 info!("Scraping search result: {}", result.url);
125
126 let mut req = scrape_request.clone();
128 req.url = result.url.clone();
129
130 match self.scrape_url(&req).await {
131 Ok(document) => {
132 result.content = Some(document);
133 }
134 Err(e) => {
135 warn!("Failed to scrape {}: {}", result.url, e);
136 }
138 }
139
140 result
141 }
142
143 async fn scrape_url(&self, request: &ScrapeRequest) -> Result<Document> {
145 let engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)?;
146 let raw_result = engine.scrape(request).await?;
147 let document = format::process_scrape_result(raw_result, request).await?;
148 Ok(document)
149 }
150}
151
152impl Default for SearchProvider {
153 fn default() -> Self {
154 Self::new().expect("Failed to create default search provider")
155 }
156}
157
158fn extract_url_from_duckduckgo(url: &str) -> String {
160 if url.starts_with("//duckduckgo.com/l/?") {
162 if let Some(query_start) = url.find('?') {
164 let query = &url[query_start + 1..];
165 for param in query.split('&') {
166 if let Some(eq_pos) = param.find('=') {
167 let key = ¶m[..eq_pos];
168 let value = ¶m[eq_pos + 1..];
169 if key == "uddg" {
170 return urlencoding::decode(value).unwrap_or_default().to_string();
171 }
172 }
173 }
174 }
175 }
176
177 url.to_string()
178}
179
180mod urlencoding {
184 pub fn encode(s: &str) -> String {
185 percent_encoding::utf8_percent_encode(s, percent_encoding::NON_ALPHANUMERIC).to_string()
186 }
187
188 pub fn decode(s: &str) -> Result<String, std::str::Utf8Error> {
189 percent_encoding::percent_decode_str(s)
190 .decode_utf8()
191 .map(|s| s.to_string())
192 }
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// A DuckDuckGo redirect link resolves to its percent-decoded `uddg` target.
    #[test]
    fn test_extract_url_from_duckduckgo() {
        const REDIRECT: &str = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage";

        let extracted = extract_url_from_duckduckgo(REDIRECT);

        assert_eq!(extracted, "https://example.com/page");
    }

    /// A URL that is not a DuckDuckGo redirect comes back untouched.
    #[test]
    fn test_extract_url_passthrough() {
        const PLAIN: &str = "https://example.com";

        let extracted = extract_url_from_duckduckgo(PLAIN);

        assert_eq!(extracted, PLAIN);
    }
}