Skip to main content

essence/search/
mod.rs

1use crate::{
2    engines::{http::HttpEngine, ScrapeEngine},
3    error::{Result, ScrapeError},
4    format,
5    types::{Document, ScrapeRequest, SearchResult},
6    utils::retry::{retry_with_backoff, RetryStrategy},
7};
8use scraper::{Html, Selector};
9use tracing::{info, warn};
10
11/// Search provider interface
12pub struct SearchProvider {
13    http_client: reqwest::Client,
14}
15
16impl SearchProvider {
17    pub fn new() -> Result<Self> {
18        let client = reqwest::Client::builder()
19            .user_agent("Mozilla/5.0 (compatible; Essence/0.1.0; +https://essence.foundation)")
20            .build()
21            .map_err(|e| ScrapeError::Internal(format!("Failed to build HTTP client: {}", e)))?;
22
23        Ok(Self {
24            http_client: client,
25        })
26    }
27
28    /// Search DuckDuckGo and return results
29    pub async fn search_duckduckgo(&self, query: &str, limit: u32) -> Result<Vec<SearchResult>> {
30        // Use conservative retry strategy for search (less aggressive)
31        let retry_config = RetryStrategy::Conservative.to_config();
32
33        // Wrap the search operation in retry logic
34        retry_with_backoff(
35            || async { self.search_duckduckgo_once(query, limit).await },
36            &retry_config,
37        )
38        .await
39    }
40
41    /// Perform a single DuckDuckGo search attempt without retry logic
42    async fn search_duckduckgo_once(
43        &self,
44        query: &str,
45        limit: u32,
46    ) -> Result<Vec<SearchResult>> {
47        info!("Searching DuckDuckGo for: {}", query);
48
49        // DuckDuckGo HTML search URL
50        let search_url = format!(
51            "https://html.duckduckgo.com/html/?q={}",
52            urlencoding::encode(query)
53        );
54
55        // Fetch search results page
56        let response = self
57            .http_client
58            .get(&search_url)
59            .send()
60            .await
61            .map_err(ScrapeError::RequestFailed)?;
62
63        let html_content = response
64            .text()
65            .await
66            .map_err(ScrapeError::RequestFailed)?;
67
68        // Parse HTML
69        let document = Html::parse_document(&html_content);
70
71        // DuckDuckGo HTML selectors
72        let result_selector = Selector::parse(".result").expect("valid CSS selector");
73        let title_selector = Selector::parse(".result__a").expect("valid CSS selector");
74        let snippet_selector = Selector::parse(".result__snippet").expect("valid CSS selector");
75
76        let mut results = Vec::new();
77
78        for result_elem in document.select(&result_selector) {
79            if results.len() >= limit as usize {
80                break;
81            }
82
83            // Extract title and URL
84            let title_elem = result_elem.select(&title_selector).next();
85            let snippet_elem = result_elem.select(&snippet_selector).next();
86
87            if let Some(title_node) = title_elem {
88                let title = title_node
89                    .text()
90                    .collect::<Vec<_>>()
91                    .join(" ")
92                    .trim()
93                    .to_string();
94                let url = title_node.value().attr("href").unwrap_or("").to_string();
95
96                // DuckDuckGo uses redirect URLs, extract the actual URL
97                let actual_url = extract_url_from_duckduckgo(&url);
98
99                let snippet = snippet_elem
100                    .map(|s| s.text().collect::<Vec<_>>().join(" ").trim().to_string())
101                    .unwrap_or_default();
102
103                if !actual_url.is_empty() && actual_url.starts_with("http") {
104                    results.push(SearchResult {
105                        title,
106                        url: actual_url,
107                        snippet,
108                        content: None,
109                    });
110                }
111            }
112        }
113
114        info!("Found {} search results", results.len());
115        Ok(results)
116    }
117
118    /// Scrape a search result and add content
119    pub async fn scrape_result(
120        &self,
121        mut result: SearchResult,
122        scrape_request: &ScrapeRequest,
123    ) -> SearchResult {
124        info!("Scraping search result: {}", result.url);
125
126        // Create scrape request with the result URL
127        let mut req = scrape_request.clone();
128        req.url = result.url.clone();
129
130        match self.scrape_url(&req).await {
131            Ok(document) => {
132                result.content = Some(document);
133            }
134            Err(e) => {
135                warn!("Failed to scrape {}: {}", result.url, e);
136                // Continue without content
137            }
138        }
139
140        result
141    }
142
143    /// Internal method to scrape a URL
144    async fn scrape_url(&self, request: &ScrapeRequest) -> Result<Document> {
145        let engine = HttpEngine::with_options(request.timeout, request.skip_tls_verification)?;
146        let raw_result = engine.scrape(request).await?;
147        let document = format::process_scrape_result(raw_result, request).await?;
148        Ok(document)
149    }
150}
151
152impl Default for SearchProvider {
153    fn default() -> Self {
154        Self::new().expect("Failed to create default search provider")
155    }
156}
157
158/// Extract the actual URL from DuckDuckGo's redirect URL
159fn extract_url_from_duckduckgo(url: &str) -> String {
160    // DuckDuckGo uses URLs like: //duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com
161    if url.starts_with("//duckduckgo.com/l/?") {
162        // Parse query parameters
163        if let Some(query_start) = url.find('?') {
164            let query = &url[query_start + 1..];
165            for param in query.split('&') {
166                if let Some(eq_pos) = param.find('=') {
167                    let key = &param[..eq_pos];
168                    let value = &param[eq_pos + 1..];
169                    if key == "uddg" {
170                        return urlencoding::decode(value).unwrap_or_default().to_string();
171                    }
172                }
173            }
174        }
175    }
176
177    url.to_string()
178}
179
// Minimal in-crate stand-in for the `urlencoding` crate, implemented on top of
// `percent_encoding` so that no additional dependency is required.
182
183mod urlencoding {
184    pub fn encode(s: &str) -> String {
185        percent_encoding::utf8_percent_encode(s, percent_encoding::NON_ALPHANUMERIC).to_string()
186    }
187
188    pub fn decode(s: &str) -> Result<String, std::str::Utf8Error> {
189        percent_encoding::percent_decode_str(s)
190            .decode_utf8()
191            .map(|s| s.to_string())
192    }
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// A redirect link yields the decoded `uddg` target.
    #[test]
    fn test_extract_url_from_duckduckgo() {
        let ddg_url = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage";
        let result = extract_url_from_duckduckgo(ddg_url);
        assert_eq!(result, "https://example.com/page");
    }

    /// A URL that is not a redirect link is returned unchanged.
    #[test]
    fn test_extract_url_passthrough() {
        let normal_url = "https://example.com";
        let result = extract_url_from_duckduckgo(normal_url);
        assert_eq!(result, normal_url);
    }

    /// Parameters following `uddg` (DuckDuckGo appends e.g. `rut`) are ignored.
    #[test]
    fn test_extract_url_ignores_extra_params() {
        let ddg_url = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc123";
        let result = extract_url_from_duckduckgo(ddg_url);
        assert_eq!(result, "https://example.com/page");
    }

    /// A redirect link without a `uddg` parameter is returned unchanged.
    #[test]
    fn test_extract_url_redirect_without_target() {
        let ddg_url = "//duckduckgo.com/l/?rut=abc123";
        let result = extract_url_from_duckduckgo(ddg_url);
        assert_eq!(result, ddg_url);
    }
}