1use scraper::{Html, Selector};
23use serde::{Deserialize, Serialize};
24use url::form_urlencoded::byte_serialize;
25
/// Browser-style `User-Agent` string; `search` configures its HTTP client with it.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/// Provider identifiers accepted by [`is_supported_provider`] and [`build_search_url`].
pub const SEARCH_PROVIDERS: [&str; 5] = ["wikipedia", "duckduckgo", "google", "bing", "brave"];

/// Provider identifier exported as the default choice.
// NOTE(review): not referenced elsewhere in this file — presumably consumed by callers.
pub const DEFAULT_PROVIDER: &str = "wikipedia";

/// Result-count limit exported as the default choice.
// NOTE(review): not referenced elsewhere in this file — presumably consumed by callers.
pub const DEFAULT_LIMIT: usize = 10;
36
/// A single entry of a parsed search-result list.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SearchResultItem {
    /// Position of the entry in the returned list; the parsers in this file start at 1.
    pub rank: usize,
    /// Title text of the result.
    pub title: String,
    /// Target URL of the result.
    pub url: String,
    /// Short descriptive text for the result; the parsers may leave it empty.
    pub snippet: String,
}
45
/// Diagnostic information about the request that produced a [`SearchResult`].
///
/// Field names are serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SearchDiagnostics {
    /// HTTP status code of the response; `search` leaves it at 0 when no response arrived.
    pub status: u16,
    /// Whether the request was blocked by CORS. `search` in this file always sets `false`.
    pub blocked_by_cors: bool,
    /// Whether the response body looked like a CAPTCHA / bot-block page.
    pub blocked_by_captcha: bool,
    /// The URL that was requested.
    pub source_url: String,
    /// Error message of a failed request, if any; left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
57
/// Complete outcome of one search: the inputs echoed back, the parsed items and diagnostics.
///
/// Field names are serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    /// The query string as supplied by the caller.
    pub query: String,
    /// Identifier of the provider that was queried.
    pub provider: String,
    /// Caller-supplied capture-mode label; stored verbatim.
    pub capture_mode: String,
    /// Caller-supplied capture timestamp; stored verbatim.
    pub captured_at: String,
    /// Parsed result entries; empty when nothing could be fetched or parsed.
    pub results: Vec<SearchResultItem>,
    /// Details about the underlying HTTP request.
    pub diagnostics: SearchDiagnostics,
}
69
70#[must_use]
72pub fn is_supported_provider(provider: &str) -> bool {
73 SEARCH_PROVIDERS.contains(&provider)
74}
75
/// Collapses every run of whitespace in `text` into one ASCII space and drops
/// leading and trailing whitespace.
fn clean_text(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
82
83pub fn build_search_url(provider: &str, query: &str, limit: usize) -> Result<String, String> {
89 let q: String = byte_serialize(query.as_bytes()).collect();
90 match provider {
91 "wikipedia" => Ok(format!(
92 "https://en.wikipedia.org/w/rest.php/v1/search/page?q={q}&limit={limit}"
93 )),
94 "duckduckgo" => Ok(format!("https://html.duckduckgo.com/html/?q={q}")),
95 "google" => Ok(format!("https://www.google.com/search?q={q}&num={limit}")),
96 "bing" => Ok(format!("https://www.bing.com/search?q={q}&count={limit}")),
97 "brave" => Ok(format!("https://search.brave.com/search?q={q}")),
98 other => Err(format!(
99 "Unknown search provider \"{other}\". Supported: {}",
100 SEARCH_PROVIDERS.join(", ")
101 )),
102 }
103}
104
/// Heuristically detects a CAPTCHA / bot-block page.
///
/// Returns `true` when the lower-cased `html` contains any of a fixed set of
/// marker phrases.
#[must_use]
pub fn looks_like_captcha(html: &str) -> bool {
    // All markers are lower-case because they are matched against lower-cased input.
    const MARKERS: [&str; 5] = [
        "captcha",
        "unusual traffic",
        "are you a robot",
        "/sorry/index",
        "automated queries",
    ];
    let haystack = html.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(*marker))
}
115
116fn resolve_duckduckgo_href(href: &str) -> String {
118 if href.is_empty() {
119 return String::new();
120 }
121 let normalized = href
122 .strip_prefix("//")
123 .map_or_else(|| href.to_string(), |stripped| format!("https:{stripped}"));
124 if let Ok(parsed) = url::Url::parse(&normalized) {
125 if let Some((_, value)) = parsed.query_pairs().find(|(k, _)| k == "uddg") {
126 return value.into_owned();
127 }
128 return parsed.to_string();
129 }
130 href.to_string()
131}
132
/// The fields of one page object that `parse_wikipedia` reads from the JSON body.
///
/// Every field is optional, so a page object lacking any of them still deserializes.
#[derive(Debug, Deserialize)]
struct WikiPage {
    /// Page key; preferred source for the article URL.
    key: Option<String>,
    /// Page title; preferred source for the displayed title.
    title: Option<String>,
    /// Excerpt text, which may contain markup; preferred source for the snippet.
    excerpt: Option<String>,
    /// Description text; used for the snippet when `excerpt` is absent.
    description: Option<String>,
}
141
/// Top-level shape of the JSON body that `parse_wikipedia` deserializes.
#[derive(Debug, Deserialize)]
struct WikiResponse {
    /// Matching pages; a missing field is treated as an empty list by `parse_wikipedia`.
    pages: Option<Vec<WikiPage>>,
}
146
/// Removes `<...>` markup from `input`, keeping the text between tags.
///
/// Everything from a `<` up to and including the next `>` is dropped. A `>`
/// that appears outside of any tag is ordinary text and is preserved. An
/// unterminated `<` discards the remainder of the input.
///
/// This is a minimal scanner, not an HTML parser: character entities are left
/// untouched.
fn strip_tags(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut in_tag = false;
    for c in input.chars() {
        match c {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup; a stray one is text
            // and falls through to the arm below.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => out.push(c),
            _ => {}
        }
    }
    out
}
160
161fn parse_wikipedia(body: &str, limit: usize) -> Vec<SearchResultItem> {
162 let parsed: WikiResponse = match serde_json::from_str(body) {
163 Ok(value) => value,
164 Err(_) => return Vec::new(),
165 };
166 let pages = parsed.pages.unwrap_or_default();
167 pages
168 .into_iter()
169 .take(limit)
170 .enumerate()
171 .map(|(i, page)| {
172 let key = page
173 .key
174 .clone()
175 .or_else(|| page.title.clone())
176 .unwrap_or_default();
177 let title = clean_text(&page.title.or(page.key).unwrap_or_default());
178 let snippet_raw = page.excerpt.or(page.description).unwrap_or_default();
179 let snippet = clean_text(&strip_tags(&snippet_raw));
180 let encoded: String = byte_serialize(key.as_bytes()).collect();
181 SearchResultItem {
182 rank: i + 1,
183 title,
184 url: format!("https://en.wikipedia.org/wiki/{encoded}"),
185 snippet,
186 }
187 })
188 .collect()
189}
190
191fn first_text(element: &scraper::ElementRef, selector: &Selector) -> String {
193 element
194 .select(selector)
195 .next()
196 .map(|el| clean_text(&el.text().collect::<String>()))
197 .unwrap_or_default()
198}
199
200fn parse_duckduckgo(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
201 let body_sel = Selector::parse(".result__body").unwrap();
202 let web_sel = Selector::parse(".web-result").unwrap();
203 let anchor_sel = Selector::parse("a.result__a").unwrap();
204 let snippet_sel = Selector::parse(".result__snippet").unwrap();
205
206 let mut containers: Vec<_> = doc.select(&body_sel).collect();
207 if containers.is_empty() {
208 containers = doc.select(&web_sel).collect();
209 }
210
211 let mut results = Vec::new();
212 for el in containers {
213 if results.len() >= limit {
214 break;
215 }
216 if let Some(anchor) = el.select(&anchor_sel).next() {
217 let title = clean_text(&anchor.text().collect::<String>());
218 let url = resolve_duckduckgo_href(anchor.value().attr("href").unwrap_or_default());
219 let snippet = first_text(&el, &snippet_sel);
220 if !title.is_empty() && !url.is_empty() {
221 results.push(SearchResultItem {
222 rank: results.len() + 1,
223 title,
224 url,
225 snippet,
226 });
227 }
228 }
229 }
230 results
231}
232
233fn parse_google(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
234 let block_sel = Selector::parse("div.g, div.tF2Cxc, div.MjjYud").unwrap();
235 let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
236 let title_sel = Selector::parse("h3").unwrap();
237 let snippet_sel = Selector::parse("div[data-sncf], .VwiC3b, .IsZvec").unwrap();
238
239 let mut results = Vec::new();
240 for el in doc.select(&block_sel) {
241 if results.len() >= limit {
242 break;
243 }
244 let url = el
245 .select(&anchor_sel)
246 .next()
247 .and_then(|a| a.value().attr("href"))
248 .unwrap_or_default()
249 .to_string();
250 let title = first_text(&el, &title_sel);
251 let snippet = first_text(&el, &snippet_sel);
252 if !title.is_empty() && !url.is_empty() {
253 results.push(SearchResultItem {
254 rank: results.len() + 1,
255 title,
256 url,
257 snippet,
258 });
259 }
260 }
261 results
262}
263
264fn parse_bing(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
265 let block_sel = Selector::parse("li.b_algo").unwrap();
266 let anchor_sel = Selector::parse("h2 a").unwrap();
267 let snippet_sel = Selector::parse(".b_caption p, p").unwrap();
268
269 let mut results = Vec::new();
270 for el in doc.select(&block_sel) {
271 if results.len() >= limit {
272 break;
273 }
274 if let Some(anchor) = el.select(&anchor_sel).next() {
275 let title = clean_text(&anchor.text().collect::<String>());
276 let url = anchor.value().attr("href").unwrap_or_default().to_string();
277 let snippet = first_text(&el, &snippet_sel);
278 if !title.is_empty() && !url.is_empty() {
279 results.push(SearchResultItem {
280 rank: results.len() + 1,
281 title,
282 url,
283 snippet,
284 });
285 }
286 }
287 }
288 results
289}
290
291fn parse_brave(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
292 let block_sel = Selector::parse("div.snippet").unwrap();
293 let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
294 let title_sel = Selector::parse(".snippet-title, .title").unwrap();
295 let snippet_sel = Selector::parse(".snippet-description, .snippet-content").unwrap();
296
297 let mut results = Vec::new();
298 for el in doc.select(&block_sel) {
299 if results.len() >= limit {
300 break;
301 }
302 let anchor = el.select(&anchor_sel).next();
303 let url = anchor
304 .and_then(|a| a.value().attr("href"))
305 .unwrap_or_default()
306 .to_string();
307 let mut title = first_text(&el, &title_sel);
308 if title.is_empty() {
309 if let Some(a) = anchor {
310 title = clean_text(&a.text().collect::<String>());
311 }
312 }
313 let snippet = first_text(&el, &snippet_sel);
314 if !title.is_empty() && !url.is_empty() {
315 results.push(SearchResultItem {
316 rank: results.len() + 1,
317 title,
318 url,
319 snippet,
320 });
321 }
322 }
323 results
324}
325
326#[must_use]
331pub fn parse_search_results(
332 provider: &str,
333 body: &str,
334 limit: usize,
335) -> (Vec<SearchResultItem>, bool) {
336 if provider == "wikipedia" {
337 return (parse_wikipedia(body, limit), false);
338 }
339 let blocked = looks_like_captcha(body);
340 let doc = Html::parse_document(body);
341 let results = match provider {
342 "duckduckgo" => parse_duckduckgo(&doc, limit),
343 "google" => parse_google(&doc, limit),
344 "bing" => parse_bing(&doc, limit),
345 "brave" => parse_brave(&doc, limit),
346 _ => Vec::new(),
347 };
348 (results, blocked)
349}
350
351#[must_use]
353pub fn format_search_as_markdown(result: &SearchResult) -> String {
354 let mut lines = Vec::new();
355 lines.push(format!("# Search results for \"{}\"", result.query));
356 lines.push(String::new());
357 lines.push(format!("- Provider: `{}`", result.provider));
358 lines.push(format!("- Capture mode: `{}`", result.capture_mode));
359 lines.push(format!("- Captured at: {}", result.captured_at));
360 lines.push(format!("- Source: {}", result.diagnostics.source_url));
361 if result.diagnostics.blocked_by_captcha {
362 lines.push("- ⚠️ Provider returned a CAPTCHA / bot-block page.".to_string());
363 }
364 lines.push(String::new());
365 if result.results.is_empty() {
366 lines.push("_No results._".to_string());
367 return lines.join("\n");
368 }
369 for item in &result.results {
370 lines.push(format!("{}. [{}]({})", item.rank, item.title, item.url));
371 if !item.snippet.is_empty() {
372 lines.push(format!(" {}", item.snippet));
373 }
374 }
375 lines.join("\n")
376}
377
378pub async fn search(
388 query: &str,
389 provider: &str,
390 limit: usize,
391 capture_mode: &str,
392 captured_at: &str,
393) -> Result<SearchResult, String> {
394 if query.trim().is_empty() {
395 return Err("Missing `query` parameter".to_string());
396 }
397 if !is_supported_provider(provider) {
398 return Err(format!(
399 "Unknown search provider \"{provider}\". Supported: {}",
400 SEARCH_PROVIDERS.join(", ")
401 ));
402 }
403
404 let source_url = build_search_url(provider, query, limit)?;
405 let mut diagnostics = SearchDiagnostics {
406 status: 0,
407 blocked_by_cors: false,
408 blocked_by_captcha: false,
409 source_url: source_url.clone(),
410 error: None,
411 };
412 let mut results = Vec::new();
413
414 let accept = if provider == "wikipedia" {
415 "application/json"
416 } else {
417 "text/html,application/xhtml+xml"
418 };
419
420 match reqwest::Client::builder().user_agent(USER_AGENT).build() {
421 Ok(client) => {
422 match client
423 .get(&source_url)
424 .header("Accept", accept)
425 .header("Accept-Language", "en-US,en;q=0.9")
426 .send()
427 .await
428 {
429 Ok(response) => {
430 diagnostics.status = response.status().as_u16();
431 match response.text().await {
432 Ok(body) => {
433 let (parsed, blocked) = parse_search_results(provider, &body, limit);
434 results = parsed;
435 diagnostics.blocked_by_captcha = blocked;
436 }
437 Err(e) => diagnostics.error = Some(e.to_string()),
438 }
439 }
440 Err(e) => diagnostics.error = Some(e.to_string()),
441 }
442 }
443 Err(e) => diagnostics.error = Some(e.to_string()),
444 }
445
446 Ok(SearchResult {
447 query: query.to_string(),
448 provider: provider.to_string(),
449 capture_mode: capture_mode.to_string(),
450 captured_at: captured_at.to_string(),
451 results,
452 diagnostics,
453 })
454}
455
// Unit tests for URL building, response parsing, CAPTCHA detection and output
// formatting. They run on inline fixtures only; no network access is involved.
#[cfg(test)]
mod tests {
    use super::*;

    // Wikipedia search response with two pages; the first excerpt contains markup.
    const WIKI_JSON: &str = r#"{"pages":[
 {"id":1,"key":"Formal_methods","title":"Formal methods","excerpt":"the <span>study</span> of <b>formal</b>","description":"rigorous"},
 {"id":2,"key":"Formal_system","title":"Formal system","excerpt":"an abstract structure","description":""}
 ]}"#;

    // DuckDuckGo results page with two entries whose links are `uddg` redirect URLs.
    const DDG_HTML: &str = r#"
 <div class="result__body">
 <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa&rut=abc">First & Best</a>
 <div class="result__snippet">Snippet about the <b>first</b> result</div>
 </div>
 <div class="result__body">
 <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.org%2Fb">Second result</a>
 <div class="result__snippet">Snippet two</div>
 </div>
 "#;

    // Bing results page with a single `li.b_algo` entry.
    const BING_HTML: &str = r#"
 <ol id="b_results">
 <li class="b_algo">
 <h2><a href="https://bing-result.example/1">Bing One</a></h2>
 <div class="b_caption"><p>Bing snippet one</p></div>
 </li>
 </ol>
 "#;

    // The Wikipedia URL carries both the query and the limit.
    #[test]
    fn builds_wikipedia_url() {
        assert_eq!(
            build_search_url("wikipedia", "formal", 5).unwrap(),
            "https://en.wikipedia.org/w/rest.php/v1/search/page?q=formal&limit=5"
        );
    }

    // A provider outside the supported list is rejected.
    #[test]
    fn rejects_unknown_provider_url() {
        assert!(build_search_url("yahoo", "x", 5).is_err());
    }

    // Titles, article URLs and tag-stripped snippets come out of the JSON fixture.
    #[test]
    fn parses_wikipedia_json() {
        let (results, blocked) = parse_search_results("wikipedia", WIKI_JSON, 10);
        assert!(!blocked);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "Formal methods");
        assert_eq!(
            results[0].url,
            "https://en.wikipedia.org/wiki/Formal_methods"
        );
        assert_eq!(results[0].snippet, "the study of formal");
        assert_eq!(
            results[1].url,
            "https://en.wikipedia.org/wiki/Formal_system"
        );
    }

    // `limit` truncates the result list.
    #[test]
    fn respects_limit() {
        let (results, _) = parse_search_results("wikipedia", WIKI_JSON, 1);
        assert_eq!(results.len(), 1);
    }

    // Redirect links are resolved to the URL held in their `uddg` parameter.
    #[test]
    fn parses_duckduckgo_and_decodes_redirects() {
        let (results, _) = parse_search_results("duckduckgo", DDG_HTML, 10);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "First & Best");
        assert_eq!(results[0].url, "https://example.com/a");
        assert_eq!(results[0].snippet, "Snippet about the first result");
        assert_eq!(results[1].url, "https://example.org/b");
    }

    // Title, URL and snippet are read from a `li.b_algo` entry.
    #[test]
    fn parses_bing() {
        let (results, _) = parse_search_results("bing", BING_HTML, 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].title, "Bing One");
        assert_eq!(results[0].url, "https://bing-result.example/1");
        assert_eq!(results[0].snippet, "Bing snippet one");
    }

    // A body that is not valid JSON produces an empty list rather than an error.
    #[test]
    fn empty_json_yields_no_results() {
        let (results, _) = parse_search_results("wikipedia", "not json", 10);
        assert!(results.is_empty());
    }

    // Marker phrases are matched case-insensitively; ordinary text is not flagged.
    #[test]
    fn detects_captcha() {
        assert!(looks_like_captcha("Please solve the CAPTCHA"));
        assert!(looks_like_captcha(
            "Our systems have detected unusual traffic"
        ));
        assert!(!looks_like_captcha("normal results page"));
    }

    // The Markdown output contains the heading, the numbered link and the snippet.
    #[test]
    fn formats_markdown() {
        let result = SearchResult {
            query: "formal-ai".to_string(),
            provider: "wikipedia".to_string(),
            capture_mode: "fetch".to_string(),
            captured_at: "2026-05-30T00:00:00Z".to_string(),
            results: vec![SearchResultItem {
                rank: 1,
                title: "Formal methods".to_string(),
                url: "https://en.wikipedia.org/wiki/Formal_methods".to_string(),
                snippet: "study of formal".to_string(),
            }],
            diagnostics: SearchDiagnostics {
                status: 200,
                blocked_by_cors: false,
                blocked_by_captcha: false,
                source_url: "https://example.com".to_string(),
                error: None,
            },
        };
        let md = format_search_as_markdown(&result);
        assert!(md.contains("# Search results for \"formal-ai\""));
        assert!(md.contains("1. [Formal methods](https://en.wikipedia.org/wiki/Formal_methods)"));
        assert!(md.contains("study of formal"));
    }

    // Serialized field names are camelCase, and a `None` error is left out entirely.
    #[test]
    fn serializes_camel_case_contract() {
        let result = SearchResult {
            query: "q".to_string(),
            provider: "wikipedia".to_string(),
            capture_mode: "fetch".to_string(),
            captured_at: "t".to_string(),
            results: vec![],
            diagnostics: SearchDiagnostics {
                status: 200,
                blocked_by_cors: false,
                blocked_by_captcha: false,
                source_url: "u".to_string(),
                error: None,
            },
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("\"captureMode\""));
        assert!(json.contains("\"capturedAt\""));
        assert!(json.contains("\"blockedByCaptcha\""));
        assert!(json.contains("\"sourceUrl\""));
        assert!(!json.contains("\"error\""));
    }
}