Skip to main content

web_capture/
search.rs

1//! Structured search-provider capture (issue #130).
2//!
3//! Turns a query + provider into a normalized, machine-readable result set so
4//! that browser, CLI, and server callers all consume one consistent contract
5//! instead of each reimplementing provider-specific scraping. Server-side and
6//! CLI callers fetch provider pages directly (no CORS restriction), so this
7//! module defaults to the `fetch` capture mode. Providers that expose a native
8//! CORS/JSON API (Wikipedia) are preferred; HTML search engines are parsed
9//! best-effort and report CAPTCHA/blocking through `diagnostics`.
10//!
11//! Normalized result shape (camelCase JSON):
12//! ```json
13//! {
14//!   "query": "...", "provider": "...", "captureMode": "fetch",
15//!   "capturedAt": "2026-05-18T20:30:00Z",
16//!   "results": [{ "rank": 1, "title": "...", "url": "...", "snippet": "..." }],
17//!   "diagnostics": { "status": 200, "blockedByCors": false,
18//!                    "blockedByCaptcha": false, "sourceUrl": "..." }
19//! }
20//! ```
21
22use scraper::{Html, Selector};
23use serde::{Deserialize, Serialize};
24use url::form_urlencoded::byte_serialize;
25
/// Browser-style `User-Agent` header value sent with every provider request.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/// Provider identifiers accepted by the search contract.
pub const SEARCH_PROVIDERS: [&str; 5] = [
    "wikipedia",
    "duckduckgo",
    "google",
    "bing",
    "brave",
];

/// Provider to fall back on when none is supplied.
pub const DEFAULT_PROVIDER: &str = "wikipedia";

/// Number of results requested/returned when no limit is supplied.
pub const DEFAULT_LIMIT: usize = 10;
36
37/// A single normalized search result.
38#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
39pub struct SearchResultItem {
40    pub rank: usize,
41    pub title: String,
42    pub url: String,
43    pub snippet: String,
44}
45
46/// Structured diagnostics describing how the capture went.
47#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
48#[serde(rename_all = "camelCase")]
49pub struct SearchDiagnostics {
50    pub status: u16,
51    pub blocked_by_cors: bool,
52    pub blocked_by_captcha: bool,
53    pub source_url: String,
54    #[serde(skip_serializing_if = "Option::is_none")]
55    pub error: Option<String>,
56}
57
58/// The full normalized search capture result.
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
60#[serde(rename_all = "camelCase")]
61pub struct SearchResult {
62    pub query: String,
63    pub provider: String,
64    pub capture_mode: String,
65    pub captured_at: String,
66    pub results: Vec<SearchResultItem>,
67    pub diagnostics: SearchDiagnostics,
68}
69
70/// Returns true if `provider` is one of the supported providers.
71#[must_use]
72pub fn is_supported_provider(provider: &str) -> bool {
73    SEARCH_PROVIDERS.contains(&provider)
74}
75
/// Collapse every run of whitespace in `text` into a single space and drop
/// leading/trailing whitespace, yielding one tidy line.
///
/// No HTML entity decoding happens here; the input is expected to be text
/// that has already been decoded.
fn clean_text(text: &str) -> String {
    // Text gathered from several DOM nodes tends to carry newlines and
    // indentation between the pieces, so rebuild it word by word.
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
82
83/// Build the provider-native source URL for a query.
84///
85/// # Errors
86///
87/// Returns an error string when `provider` is not supported.
88pub fn build_search_url(provider: &str, query: &str, limit: usize) -> Result<String, String> {
89    let q: String = byte_serialize(query.as_bytes()).collect();
90    match provider {
91        "wikipedia" => Ok(format!(
92            "https://en.wikipedia.org/w/rest.php/v1/search/page?q={q}&limit={limit}"
93        )),
94        "duckduckgo" => Ok(format!("https://html.duckduckgo.com/html/?q={q}")),
95        "google" => Ok(format!("https://www.google.com/search?q={q}&num={limit}")),
96        "bing" => Ok(format!("https://www.bing.com/search?q={q}&count={limit}")),
97        "brave" => Ok(format!("https://search.brave.com/search?q={q}")),
98        other => Err(format!(
99            "Unknown search provider \"{other}\". Supported: {}",
100            SEARCH_PROVIDERS.join(", ")
101        )),
102    }
103}
104
/// Heuristically detect a provider CAPTCHA / bot-block interstitial.
///
/// Returns `true` when the body contains any of a fixed set of marker
/// phrases, compared case-insensitively.
#[must_use]
pub fn looks_like_captcha(html: &str) -> bool {
    // Lower-case phrases seen on block pages; matched against a lower-cased
    // copy of the body.
    const BLOCK_MARKERS: [&str; 5] = [
        "captcha",
        "unusual traffic",
        "are you a robot",
        "/sorry/index",
        "automated queries",
    ];

    let haystack = html.to_lowercase();
    BLOCK_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker))
}
115
116/// Decode a `DuckDuckGo` redirect href (`//duckduckgo.com/l/?uddg=...`).
117fn resolve_duckduckgo_href(href: &str) -> String {
118    if href.is_empty() {
119        return String::new();
120    }
121    let normalized = href
122        .strip_prefix("//")
123        .map_or_else(|| href.to_string(), |stripped| format!("https:{stripped}"));
124    if let Ok(parsed) = url::Url::parse(&normalized) {
125        if let Some((_, value)) = parsed.query_pairs().find(|(k, _)| k == "uddg") {
126            return value.into_owned();
127        }
128        return parsed.to_string();
129    }
130    href.to_string()
131}
132
133/// Wikipedia REST search page entry.
134#[derive(Debug, Deserialize)]
135struct WikiPage {
136    key: Option<String>,
137    title: Option<String>,
138    excerpt: Option<String>,
139    description: Option<String>,
140}
141
142#[derive(Debug, Deserialize)]
143struct WikiResponse {
144    pages: Option<Vec<WikiPage>>,
145}
146
/// Remove markup from `input`, keeping only the characters outside tags.
///
/// This is a plain character scan, not an HTML parser: `<` starts a skipped
/// region, `>` ends it, and neither character is ever emitted. A stray `>`
/// outside a tag is therefore dropped, and an unterminated `<` swallows the
/// rest of the input.
fn strip_tags(input: &str) -> String {
    let mut inside_markup = false;
    input
        .chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_markup = true;
                false
            }
            '>' => {
                inside_markup = false;
                false
            }
            _ => !inside_markup,
        })
        .collect()
}
160
161fn parse_wikipedia(body: &str, limit: usize) -> Vec<SearchResultItem> {
162    let parsed: WikiResponse = match serde_json::from_str(body) {
163        Ok(value) => value,
164        Err(_) => return Vec::new(),
165    };
166    let pages = parsed.pages.unwrap_or_default();
167    pages
168        .into_iter()
169        .take(limit)
170        .enumerate()
171        .map(|(i, page)| {
172            let key = page
173                .key
174                .clone()
175                .or_else(|| page.title.clone())
176                .unwrap_or_default();
177            let title = clean_text(&page.title.or(page.key).unwrap_or_default());
178            let snippet_raw = page.excerpt.or(page.description).unwrap_or_default();
179            let snippet = clean_text(&strip_tags(&snippet_raw));
180            let encoded: String = byte_serialize(key.as_bytes()).collect();
181            SearchResultItem {
182                rank: i + 1,
183                title,
184                url: format!("https://en.wikipedia.org/wiki/{encoded}"),
185                snippet,
186            }
187        })
188        .collect()
189}
190
191/// Extract trimmed text content of the first element matching `selector`.
192fn first_text(element: &scraper::ElementRef, selector: &Selector) -> String {
193    element
194        .select(selector)
195        .next()
196        .map(|el| clean_text(&el.text().collect::<String>()))
197        .unwrap_or_default()
198}
199
200fn parse_duckduckgo(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
201    let body_sel = Selector::parse(".result__body").unwrap();
202    let web_sel = Selector::parse(".web-result").unwrap();
203    let anchor_sel = Selector::parse("a.result__a").unwrap();
204    let snippet_sel = Selector::parse(".result__snippet").unwrap();
205
206    let mut containers: Vec<_> = doc.select(&body_sel).collect();
207    if containers.is_empty() {
208        containers = doc.select(&web_sel).collect();
209    }
210
211    let mut results = Vec::new();
212    for el in containers {
213        if results.len() >= limit {
214            break;
215        }
216        if let Some(anchor) = el.select(&anchor_sel).next() {
217            let title = clean_text(&anchor.text().collect::<String>());
218            let url = resolve_duckduckgo_href(anchor.value().attr("href").unwrap_or_default());
219            let snippet = first_text(&el, &snippet_sel);
220            if !title.is_empty() && !url.is_empty() {
221                results.push(SearchResultItem {
222                    rank: results.len() + 1,
223                    title,
224                    url,
225                    snippet,
226                });
227            }
228        }
229    }
230    results
231}
232
233fn parse_google(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
234    let block_sel = Selector::parse("div.g, div.tF2Cxc, div.MjjYud").unwrap();
235    let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
236    let title_sel = Selector::parse("h3").unwrap();
237    let snippet_sel = Selector::parse("div[data-sncf], .VwiC3b, .IsZvec").unwrap();
238
239    let mut results = Vec::new();
240    for el in doc.select(&block_sel) {
241        if results.len() >= limit {
242            break;
243        }
244        let url = el
245            .select(&anchor_sel)
246            .next()
247            .and_then(|a| a.value().attr("href"))
248            .unwrap_or_default()
249            .to_string();
250        let title = first_text(&el, &title_sel);
251        let snippet = first_text(&el, &snippet_sel);
252        if !title.is_empty() && !url.is_empty() {
253            results.push(SearchResultItem {
254                rank: results.len() + 1,
255                title,
256                url,
257                snippet,
258            });
259        }
260    }
261    results
262}
263
264fn parse_bing(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
265    let block_sel = Selector::parse("li.b_algo").unwrap();
266    let anchor_sel = Selector::parse("h2 a").unwrap();
267    let snippet_sel = Selector::parse(".b_caption p, p").unwrap();
268
269    let mut results = Vec::new();
270    for el in doc.select(&block_sel) {
271        if results.len() >= limit {
272            break;
273        }
274        if let Some(anchor) = el.select(&anchor_sel).next() {
275            let title = clean_text(&anchor.text().collect::<String>());
276            let url = anchor.value().attr("href").unwrap_or_default().to_string();
277            let snippet = first_text(&el, &snippet_sel);
278            if !title.is_empty() && !url.is_empty() {
279                results.push(SearchResultItem {
280                    rank: results.len() + 1,
281                    title,
282                    url,
283                    snippet,
284                });
285            }
286        }
287    }
288    results
289}
290
291fn parse_brave(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
292    let block_sel = Selector::parse("div.snippet").unwrap();
293    let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
294    let title_sel = Selector::parse(".snippet-title, .title").unwrap();
295    let snippet_sel = Selector::parse(".snippet-description, .snippet-content").unwrap();
296
297    let mut results = Vec::new();
298    for el in doc.select(&block_sel) {
299        if results.len() >= limit {
300            break;
301        }
302        let anchor = el.select(&anchor_sel).next();
303        let url = anchor
304            .and_then(|a| a.value().attr("href"))
305            .unwrap_or_default()
306            .to_string();
307        let mut title = first_text(&el, &title_sel);
308        if title.is_empty() {
309            if let Some(a) = anchor {
310                title = clean_text(&a.text().collect::<String>());
311            }
312        }
313        let snippet = first_text(&el, &snippet_sel);
314        if !title.is_empty() && !url.is_empty() {
315            results.push(SearchResultItem {
316                rank: results.len() + 1,
317                title,
318                url,
319                snippet,
320            });
321        }
322    }
323    results
324}
325
326/// Parse a provider response body into normalized result rows.
327///
328/// Pure function (no network) so it can be unit-tested against fixtures.
329/// Returns the parsed rows and whether the body looked like a CAPTCHA wall.
330#[must_use]
331pub fn parse_search_results(
332    provider: &str,
333    body: &str,
334    limit: usize,
335) -> (Vec<SearchResultItem>, bool) {
336    if provider == "wikipedia" {
337        return (parse_wikipedia(body, limit), false);
338    }
339    let blocked = looks_like_captcha(body);
340    let doc = Html::parse_document(body);
341    let results = match provider {
342        "duckduckgo" => parse_duckduckgo(&doc, limit),
343        "google" => parse_google(&doc, limit),
344        "bing" => parse_bing(&doc, limit),
345        "brave" => parse_brave(&doc, limit),
346        _ => Vec::new(),
347    };
348    (results, blocked)
349}
350
351/// Render a normalized search result as Markdown.
352#[must_use]
353pub fn format_search_as_markdown(result: &SearchResult) -> String {
354    let mut lines = Vec::new();
355    lines.push(format!("# Search results for \"{}\"", result.query));
356    lines.push(String::new());
357    lines.push(format!("- Provider: `{}`", result.provider));
358    lines.push(format!("- Capture mode: `{}`", result.capture_mode));
359    lines.push(format!("- Captured at: {}", result.captured_at));
360    lines.push(format!("- Source: {}", result.diagnostics.source_url));
361    if result.diagnostics.blocked_by_captcha {
362        lines.push("- ⚠️ Provider returned a CAPTCHA / bot-block page.".to_string());
363    }
364    lines.push(String::new());
365    if result.results.is_empty() {
366        lines.push("_No results._".to_string());
367        return lines.join("\n");
368    }
369    for item in &result.results {
370        lines.push(format!("{}. [{}]({})", item.rank, item.title, item.url));
371        if !item.snippet.is_empty() {
372            lines.push(format!("   {}", item.snippet));
373        }
374    }
375    lines.join("\n")
376}
377
378/// Capture structured search results for a query from a provider.
379///
380/// `captured_at` is injected (RFC 3339 timestamp) so the result is
381/// deterministic for callers and tests. A transport failure is recorded in
382/// `diagnostics` rather than returned as an error, mirroring the JS contract.
383///
384/// # Errors
385///
386/// Returns an error string for an empty query or unsupported provider.
387pub async fn search(
388    query: &str,
389    provider: &str,
390    limit: usize,
391    capture_mode: &str,
392    captured_at: &str,
393) -> Result<SearchResult, String> {
394    if query.trim().is_empty() {
395        return Err("Missing `query` parameter".to_string());
396    }
397    if !is_supported_provider(provider) {
398        return Err(format!(
399            "Unknown search provider \"{provider}\". Supported: {}",
400            SEARCH_PROVIDERS.join(", ")
401        ));
402    }
403
404    let source_url = build_search_url(provider, query, limit)?;
405    let mut diagnostics = SearchDiagnostics {
406        status: 0,
407        blocked_by_cors: false,
408        blocked_by_captcha: false,
409        source_url: source_url.clone(),
410        error: None,
411    };
412    let mut results = Vec::new();
413
414    let accept = if provider == "wikipedia" {
415        "application/json"
416    } else {
417        "text/html,application/xhtml+xml"
418    };
419
420    match reqwest::Client::builder().user_agent(USER_AGENT).build() {
421        Ok(client) => {
422            match client
423                .get(&source_url)
424                .header("Accept", accept)
425                .header("Accept-Language", "en-US,en;q=0.9")
426                .send()
427                .await
428            {
429                Ok(response) => {
430                    diagnostics.status = response.status().as_u16();
431                    match response.text().await {
432                        Ok(body) => {
433                            let (parsed, blocked) = parse_search_results(provider, &body, limit);
434                            results = parsed;
435                            diagnostics.blocked_by_captcha = blocked;
436                        }
437                        Err(e) => diagnostics.error = Some(e.to_string()),
438                    }
439                }
440                Err(e) => diagnostics.error = Some(e.to_string()),
441            }
442        }
443        Err(e) => diagnostics.error = Some(e.to_string()),
444    }
445
446    Ok(SearchResult {
447        query: query.to_string(),
448        provider: provider.to_string(),
449        capture_mode: capture_mode.to_string(),
450        captured_at: captured_at.to_string(),
451        results,
452        diagnostics,
453    })
454}
455
#[cfg(test)]
mod tests {
    //! Fixture-based tests for the pure (non-network) parts of the module.

    use super::*;

    // Wikipedia REST search response with two pages.
    const WIKI_JSON: &str = r#"{"pages":[
        {"id":1,"key":"Formal_methods","title":"Formal methods","excerpt":"the <span>study</span> of <b>formal</b>","description":"rigorous"},
        {"id":2,"key":"Formal_system","title":"Formal system","excerpt":"an abstract structure","description":""}
    ]}"#;

    // DuckDuckGo HTML results using protocol-relative redirect links.
    const DDG_HTML: &str = r#"
        <div class="result__body">
          <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa&rut=abc">First &amp; Best</a>
          <div class="result__snippet">Snippet about the <b>first</b> result</div>
        </div>
        <div class="result__body">
          <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.org%2Fb">Second result</a>
          <div class="result__snippet">Snippet two</div>
        </div>
    "#;

    // Bing HTML results with a single organic hit.
    const BING_HTML: &str = r#"
        <ol id="b_results">
          <li class="b_algo">
            <h2><a href="https://bing-result.example/1">Bing One</a></h2>
            <div class="b_caption"><p>Bing snippet one</p></div>
          </li>
        </ol>
    "#;

    // Google HTML results with a single, non-nested result block.
    const GOOGLE_HTML: &str = r#"
        <div class="g">
          <a href="https://google-result.example/1"><h3>Google One</h3></a>
          <div class="VwiC3b">Google snippet one</div>
        </div>
    "#;

    // Brave HTML results with a single snippet card.
    const BRAVE_HTML: &str = r#"
        <div class="snippet">
          <a href="https://brave-result.example/1">
            <div class="title">Brave One</div>
          </a>
          <div class="snippet-description">Brave snippet one</div>
        </div>
    "#;

    #[test]
    fn builds_wikipedia_url() {
        assert_eq!(
            build_search_url("wikipedia", "formal", 5).unwrap(),
            "https://en.wikipedia.org/w/rest.php/v1/search/page?q=formal&limit=5"
        );
    }

    #[test]
    fn form_encodes_query_in_url() {
        assert_eq!(
            build_search_url("duckduckgo", "rust lang", 10).unwrap(),
            "https://html.duckduckgo.com/html/?q=rust+lang"
        );
    }

    #[test]
    fn rejects_unknown_provider_url() {
        assert!(build_search_url("yahoo", "x", 5).is_err());
    }

    #[test]
    fn recognizes_supported_providers() {
        assert!(is_supported_provider(DEFAULT_PROVIDER));
        assert!(is_supported_provider("brave"));
        assert!(!is_supported_provider("yahoo"));
        assert!(!is_supported_provider("Wikipedia"));
    }

    #[test]
    fn parses_wikipedia_json() {
        let (results, blocked) = parse_search_results("wikipedia", WIKI_JSON, 10);
        assert!(!blocked);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "Formal methods");
        assert_eq!(
            results[0].url,
            "https://en.wikipedia.org/wiki/Formal_methods"
        );
        assert_eq!(results[0].snippet, "the study of formal");
        assert_eq!(
            results[1].url,
            "https://en.wikipedia.org/wiki/Formal_system"
        );
    }

    #[test]
    fn respects_limit() {
        let (results, _) = parse_search_results("wikipedia", WIKI_JSON, 1);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn respects_limit_for_html_providers() {
        let (results, _) = parse_search_results("duckduckgo", DDG_HTML, 1);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].rank, 1);
        assert_eq!(results[0].url, "https://example.com/a");
    }

    #[test]
    fn parses_duckduckgo_and_decodes_redirects() {
        let (results, _) = parse_search_results("duckduckgo", DDG_HTML, 10);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "First & Best");
        assert_eq!(results[0].url, "https://example.com/a");
        assert_eq!(results[0].snippet, "Snippet about the first result");
        assert_eq!(results[1].url, "https://example.org/b");
    }

    #[test]
    fn parses_bing() {
        let (results, _) = parse_search_results("bing", BING_HTML, 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].title, "Bing One");
        assert_eq!(results[0].url, "https://bing-result.example/1");
        assert_eq!(results[0].snippet, "Bing snippet one");
    }

    #[test]
    fn parses_google() {
        let (results, blocked) = parse_search_results("google", GOOGLE_HTML, 10);
        assert!(!blocked);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].rank, 1);
        assert_eq!(results[0].title, "Google One");
        assert_eq!(results[0].url, "https://google-result.example/1");
        assert_eq!(results[0].snippet, "Google snippet one");
    }

    #[test]
    fn parses_brave() {
        let (results, blocked) = parse_search_results("brave", BRAVE_HTML, 10);
        assert!(!blocked);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].title, "Brave One");
        assert_eq!(results[0].url, "https://brave-result.example/1");
        assert_eq!(results[0].snippet, "Brave snippet one");
    }

    #[test]
    fn empty_json_yields_no_results() {
        let (results, _) = parse_search_results("wikipedia", "not json", 10);
        assert!(results.is_empty());
    }

    #[test]
    fn unknown_provider_yields_no_results() {
        let (results, blocked) = parse_search_results("yahoo", "<html></html>", 10);
        assert!(results.is_empty());
        assert!(!blocked);
    }

    #[test]
    fn detects_captcha() {
        assert!(looks_like_captcha("Please solve the CAPTCHA"));
        assert!(looks_like_captcha(
            "Our systems have detected unusual traffic"
        ));
        assert!(!looks_like_captcha("normal results page"));
    }

    #[test]
    fn reports_captcha_flag_for_html_providers() {
        let body = "<html><body>Our systems have detected unusual traffic</body></html>";
        let (results, blocked) = parse_search_results("google", body, 10);
        assert!(results.is_empty());
        assert!(blocked);
    }

    #[test]
    fn formats_markdown() {
        let result = SearchResult {
            query: "formal-ai".to_string(),
            provider: "wikipedia".to_string(),
            capture_mode: "fetch".to_string(),
            captured_at: "2026-05-30T00:00:00Z".to_string(),
            results: vec![SearchResultItem {
                rank: 1,
                title: "Formal methods".to_string(),
                url: "https://en.wikipedia.org/wiki/Formal_methods".to_string(),
                snippet: "study of formal".to_string(),
            }],
            diagnostics: SearchDiagnostics {
                status: 200,
                blocked_by_cors: false,
                blocked_by_captcha: false,
                source_url: "https://example.com".to_string(),
                error: None,
            },
        };
        let md = format_search_as_markdown(&result);
        assert!(md.contains("# Search results for \"formal-ai\""));
        assert!(md.contains("1. [Formal methods](https://en.wikipedia.org/wiki/Formal_methods)"));
        assert!(md.contains("study of formal"));
    }

    #[test]
    fn serializes_camel_case_contract() {
        let result = SearchResult {
            query: "q".to_string(),
            provider: "wikipedia".to_string(),
            capture_mode: "fetch".to_string(),
            captured_at: "t".to_string(),
            results: vec![],
            diagnostics: SearchDiagnostics {
                status: 200,
                blocked_by_cors: false,
                blocked_by_captcha: false,
                source_url: "u".to_string(),
                error: None,
            },
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("\"captureMode\""));
        assert!(json.contains("\"capturedAt\""));
        assert!(json.contains("\"blockedByCaptcha\""));
        assert!(json.contains("\"sourceUrl\""));
        // `error` is `None`, so the key must be absent rather than `null`.
        assert!(!json.contains("\"error\""));
    }
}