use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use url::form_urlencoded::byte_serialize;
/// Browser-like `User-Agent` string installed on the HTTP client that `search` builds.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Identifiers of every provider this module can build a URL for and parse results from.
pub const SEARCH_PROVIDERS: [&str; 5] = ["wikipedia", "duckduckgo", "google", "bing", "brave"];
/// Provider identifier exported as the default.
// NOTE(review): not referenced in this file; presumably applied by callers — confirm.
pub const DEFAULT_PROVIDER: &str = "wikipedia";
/// Result-count limit exported as the default.
// NOTE(review): not referenced in this file; presumably applied by callers — confirm.
pub const DEFAULT_LIMIT: usize = 10;
/// A single entry extracted from a provider's result page.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SearchResultItem {
/// 1-based position of the entry within the parsed result list.
pub rank: usize,
/// Result title with whitespace runs collapsed to single spaces.
pub title: String,
/// Target URL of the result.
pub url: String,
/// Short description shown with the result; empty when none was found.
pub snippet: String,
}
/// Transport-level details about one search request.
///
/// Field names are serialized in camelCase (e.g. `blockedByCaptcha`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SearchDiagnostics {
/// HTTP status code of the response; `search` leaves it at 0 when no response arrived.
pub status: u16,
/// Whether the request was blocked by CORS.
// NOTE(review): `search` in this file always writes `false` here — confirm who sets it.
pub blocked_by_cors: bool,
/// `true` when the response body matched one of the CAPTCHA / bot-block markers.
pub blocked_by_captcha: bool,
/// The URL that was requested from the provider.
pub source_url: String,
/// Error text when the request or body read failed; omitted from the serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Complete outcome of one search: the echoed request parameters, the parsed
/// results, and diagnostics about the underlying HTTP exchange.
///
/// Field names are serialized in camelCase (e.g. `captureMode`, `capturedAt`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
/// The query string as supplied by the caller.
pub query: String,
/// Identifier of the provider that was queried.
pub provider: String,
/// Caller-supplied label, copied through unchanged by `search`.
pub capture_mode: String,
/// Caller-supplied timestamp string, copied through unchanged by `search`.
pub captured_at: String,
/// Parsed result entries, in rank order.
pub results: Vec<SearchResultItem>,
/// Details about the HTTP request that produced `results`.
pub diagnostics: SearchDiagnostics,
}
/// Returns `true` when `provider` exactly matches (case-sensitively) one of
/// the identifiers listed in [`SEARCH_PROVIDERS`].
#[must_use]
pub fn is_supported_provider(provider: &str) -> bool {
    SEARCH_PROVIDERS
        .iter()
        .any(|candidate| *candidate == provider)
}
/// Normalizes whitespace: leading and trailing whitespace is removed and every
/// internal run of whitespace becomes a single ASCII space.
fn clean_text(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Separator goes before every word except the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Builds the request URL for `query` on the given `provider`.
///
/// The query is form-urlencoded before being placed in the `q` parameter.
/// `limit` is forwarded to the providers whose URL carries a count parameter
/// (Wikipedia `limit`, Google `num`, Bing `count`); the DuckDuckGo and Brave
/// URLs do not include it.
///
/// # Errors
///
/// Returns a message listing the supported providers when `provider` is not
/// one of the known identifiers.
pub fn build_search_url(provider: &str, query: &str, limit: usize) -> Result<String, String> {
    let encoded = byte_serialize(query.as_bytes()).collect::<String>();
    let url = match provider {
        "wikipedia" => format!(
            "https://en.wikipedia.org/w/rest.php/v1/search/page?q={encoded}&limit={limit}"
        ),
        "duckduckgo" => format!("https://html.duckduckgo.com/html/?q={encoded}"),
        "google" => format!("https://www.google.com/search?q={encoded}&num={limit}"),
        "bing" => format!("https://www.bing.com/search?q={encoded}&count={limit}"),
        "brave" => format!("https://search.brave.com/search?q={encoded}"),
        unknown => {
            return Err(format!(
                "Unknown search provider \"{unknown}\". Supported: {}",
                SEARCH_PROVIDERS.join(", ")
            ));
        }
    };
    Ok(url)
}
/// Heuristically decides whether `html` is a CAPTCHA / bot-block page.
///
/// The check is a case-insensitive substring search for a fixed set of
/// marker phrases; it returns `true` as soon as any marker is present.
#[must_use]
pub fn looks_like_captcha(html: &str) -> bool {
    // Markers are lowercase because the haystack is lowercased before matching.
    const MARKERS: [&str; 5] = [
        "captcha",
        "unusual traffic",
        "are you a robot",
        "/sorry/index",
        "automated queries",
    ];
    let haystack = html.to_lowercase();
    MARKERS.iter().any(|marker| haystack.contains(marker))
}
/// Turns the `href` of a DuckDuckGo result anchor into the real target URL.
///
/// The href is resolved the way a browser would resolve it on the HTML
/// results page, so absolute (`https://…`), scheme-relative (`//host/…`) and
/// site-relative (`/l/?uddg=…`) forms are all handled. When the resolved URL
/// carries a `uddg` query parameter (DuckDuckGo's redirect wrapper), the
/// decoded value of that parameter is returned; otherwise the resolved URL
/// itself is returned.
///
/// Returns an empty string for an empty `href`, and the input unchanged when
/// it cannot be resolved at all.
fn resolve_duckduckgo_href(href: &str) -> String {
    if href.is_empty() {
        return String::new();
    }
    // Same origin/path that `build_search_url` requests for DuckDuckGo. Joining
    // against it avoids hand-assembling "https:" + rest, which previously
    // produced `https:host/...` and depended on parser leniency, and it also
    // resolves site-relative redirect links that used to be returned raw.
    let resolved =
        url::Url::parse("https://html.duckduckgo.com/html/").and_then(|base| base.join(href));
    match resolved {
        Ok(parsed) => {
            let redirect_target = parsed
                .query_pairs()
                .find(|(k, _)| k == "uddg")
                .map(|(_, value)| value.into_owned());
            redirect_target.unwrap_or_else(|| parsed.to_string())
        }
        Err(_) => href.to_string(),
    }
}
/// One entry of the `pages` array in the Wikipedia search response.
///
/// Every field is optional so that a missing or `null` value does not make
/// deserialization fail.
#[derive(Debug, Deserialize)]
struct WikiPage {
/// Page key; `parse_wikipedia` uses it to build the article URL.
key: Option<String>,
/// Display title of the page.
title: Option<String>,
/// Search excerpt; may contain HTML tags, which `parse_wikipedia` strips.
excerpt: Option<String>,
/// Page description, used as the snippet when `excerpt` is absent.
description: Option<String>,
}
/// Top-level shape of the Wikipedia search response body, reduced to the
/// single field this module reads.
#[derive(Debug, Deserialize)]
struct WikiResponse {
/// Matching pages; `None` when the field is missing from the response.
pages: Option<Vec<WikiPage>>,
}
/// Removes `<...>` markup from `input`, keeping only the text between tags.
///
/// This is a small character scanner, not an HTML parser: everything from a
/// `<` up to the next `>` is dropped. A `>` that appears while no tag is open
/// is ordinary text and is preserved (it used to be silently discarded).
/// HTML entities are left untouched.
fn strip_tags(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut in_tag = false;
    for c in input.chars() {
        match c {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => out.push(c),
            _ => {}
        }
    }
    out
}
/// Parses the JSON body of a Wikipedia search response into result items.
///
/// Returns at most `limit` items ranked from 1. A body that is not valid JSON,
/// or that has no `pages` array, yields an empty list.
///
/// For each page:
/// * the article URL is built from `key`, falling back to `title`;
/// * the displayed title is `title`, falling back to `key`;
/// * the snippet is `excerpt` (or `description` when `excerpt` is absent) with
///   tags stripped and whitespace collapsed.
///
/// Pages that provide neither a key nor a title are skipped, matching the
/// HTML parsers in this file, which drop entries without a title or URL.
fn parse_wikipedia(body: &str, limit: usize) -> Vec<SearchResultItem> {
    let pages = serde_json::from_str::<WikiResponse>(body)
        .ok()
        .and_then(|response| response.pages)
        .unwrap_or_default();
    pages
        .into_iter()
        .filter_map(|page| {
            // Wikipedia article paths use underscores for spaces. Without this,
            // the form-urlencoding below turns a space (possible when falling
            // back to the title) into `+`, which is a literal plus in a URL
            // path and therefore points at the wrong article.
            let slug = page
                .key
                .as_deref()
                .or(page.title.as_deref())
                .unwrap_or_default()
                .replace(' ', "_");
            let title = clean_text(
                page.title
                    .as_deref()
                    .or(page.key.as_deref())
                    .unwrap_or_default(),
            );
            if slug.is_empty() || title.is_empty() {
                return None;
            }
            let snippet_raw = page.excerpt.or(page.description).unwrap_or_default();
            let snippet = clean_text(&strip_tags(&snippet_raw));
            let encoded: String = byte_serialize(slug.as_bytes()).collect();
            let url = format!("https://en.wikipedia.org/wiki/{encoded}");
            Some((title, url, snippet))
        })
        // Limit and rank are applied after filtering so ranks stay consecutive.
        .take(limit)
        .enumerate()
        .map(|(i, (title, url, snippet))| SearchResultItem {
            rank: i + 1,
            title,
            url,
            snippet,
        })
        .collect()
}
/// Returns the whitespace-normalized text of the first descendant of
/// `element` matching `selector`, or an empty string when nothing matches.
fn first_text(element: &scraper::ElementRef, selector: &Selector) -> String {
    match element.select(selector).next() {
        Some(hit) => {
            let raw: String = hit.text().collect();
            clean_text(&raw)
        }
        None => String::new(),
    }
}
/// Extracts up to `limit` results from a DuckDuckGo HTML results page.
///
/// Result containers are the `.result__body` elements; when the page has
/// none, `.web-result` elements are used instead. Within a container the
/// title and link come from the first `a.result__a`, and the snippet from the
/// first `.result__snippet`. Containers without an anchor, or whose title or
/// resolved URL is empty, are skipped. Ranks are 1-based and count only the
/// entries that were kept.
fn parse_duckduckgo(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
    let body_sel = Selector::parse(".result__body").unwrap();
    let web_sel = Selector::parse(".web-result").unwrap();
    let anchor_sel = Selector::parse("a.result__a").unwrap();
    let snippet_sel = Selector::parse(".result__snippet").unwrap();

    let primary = doc.select(&body_sel).collect::<Vec<_>>();
    let containers = if primary.is_empty() {
        doc.select(&web_sel).collect::<Vec<_>>()
    } else {
        primary
    };

    containers
        .into_iter()
        .filter_map(|container| {
            let link = container.select(&anchor_sel).next()?;
            let title = clean_text(&link.text().collect::<String>());
            let url = resolve_duckduckgo_href(link.value().attr("href").unwrap_or_default());
            if title.is_empty() || url.is_empty() {
                return None;
            }
            let snippet = first_text(&container, &snippet_sel);
            Some((title, url, snippet))
        })
        .take(limit)
        .enumerate()
        .map(|(index, (title, url, snippet))| SearchResultItem {
            rank: index + 1,
            title,
            url,
            snippet,
        })
        .collect()
}
/// Extracts up to `limit` results from a Google HTML results page.
///
/// Candidate blocks are elements matching `div.g`, `div.tF2Cxc` or
/// `div.MjjYud`. For each block the URL is the `href` of the first anchor
/// whose `href` starts with `http`, the title is the first `h3`, and the
/// snippet is the first element matching the snippet selector. Blocks with an
/// empty title or URL are skipped.
///
/// The block selector lists several classes, and `select` yields every
/// matching element, so a wrapper and the blocks nested inside it can all
/// match and resolve to the same first link. Results are therefore
/// de-duplicated by URL: only the first block producing a given URL is kept,
/// which prevents repeated entries from consuming the limit.
fn parse_google(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
    let block_sel = Selector::parse("div.g, div.tF2Cxc, div.MjjYud").unwrap();
    let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
    let title_sel = Selector::parse("h3").unwrap();
    let snippet_sel = Selector::parse("div[data-sncf], .VwiC3b, .IsZvec").unwrap();
    let mut seen_urls = std::collections::HashSet::new();
    let mut results = Vec::new();
    for el in doc.select(&block_sel) {
        if results.len() >= limit {
            break;
        }
        let url = el
            .select(&anchor_sel)
            .next()
            .and_then(|a| a.value().attr("href"))
            .unwrap_or_default()
            .to_string();
        let title = first_text(&el, &title_sel);
        if title.is_empty() || url.is_empty() {
            continue;
        }
        // `insert` returns false when the URL was already emitted by an
        // enclosing or earlier block.
        if !seen_urls.insert(url.clone()) {
            continue;
        }
        let snippet = first_text(&el, &snippet_sel);
        results.push(SearchResultItem {
            rank: results.len() + 1,
            title,
            url,
            snippet,
        });
    }
    results
}
/// Extracts up to `limit` results from a Bing HTML results page.
///
/// Each `li.b_algo` element is one candidate. The title and URL come from the
/// first `h2 a` inside it and the snippet from the first paragraph matched by
/// the snippet selector. Candidates without such an anchor, or with an empty
/// title or `href`, are skipped. Ranks are 1-based and count only kept entries.
fn parse_bing(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
    let item_sel = Selector::parse("li.b_algo").unwrap();
    let link_sel = Selector::parse("h2 a").unwrap();
    let caption_sel = Selector::parse(".b_caption p, p").unwrap();

    let mut collected: Vec<SearchResultItem> = Vec::new();
    for item in doc.select(&item_sel) {
        if collected.len() >= limit {
            break;
        }
        let link = match item.select(&link_sel).next() {
            Some(link) => link,
            None => continue,
        };
        let title = clean_text(&link.text().collect::<String>());
        let url = link.value().attr("href").unwrap_or_default().to_string();
        if title.is_empty() || url.is_empty() {
            continue;
        }
        let snippet = first_text(&item, &caption_sel);
        let rank = collected.len() + 1;
        collected.push(SearchResultItem {
            rank,
            title,
            url,
            snippet,
        });
    }
    collected
}
/// Extracts up to `limit` results from a Brave Search HTML results page.
///
/// Each `div.snippet` element is one candidate. The URL is the `href` of the
/// first anchor whose `href` starts with `http`. The title is the first
/// `.snippet-title` / `.title` element, falling back to that anchor's own
/// text when no title text is found. The snippet is the first
/// `.snippet-description` / `.snippet-content` element. Candidates with an
/// empty title or URL are skipped; ranks are 1-based over kept entries.
fn parse_brave(doc: &Html, limit: usize) -> Vec<SearchResultItem> {
    let block_sel = Selector::parse("div.snippet").unwrap();
    let anchor_sel = Selector::parse("a[href^=\"http\"]").unwrap();
    let title_sel = Selector::parse(".snippet-title, .title").unwrap();
    let snippet_sel = Selector::parse(".snippet-description, .snippet-content").unwrap();

    doc.select(&block_sel)
        .filter_map(|block| {
            let link = block.select(&anchor_sel).next();
            let url = link
                .and_then(|a| a.value().attr("href"))
                .unwrap_or_default()
                .to_string();
            let dedicated_title = first_text(&block, &title_sel);
            let title = if dedicated_title.is_empty() {
                // No title element: use the link text when a link exists.
                link.map(|a| clean_text(&a.text().collect::<String>()))
                    .unwrap_or_default()
            } else {
                dedicated_title
            };
            if title.is_empty() || url.is_empty() {
                return None;
            }
            let snippet = first_text(&block, &snippet_sel);
            Some((title, url, snippet))
        })
        .take(limit)
        .enumerate()
        .map(|(index, (title, url, snippet))| SearchResultItem {
            rank: index + 1,
            title,
            url,
            snippet,
        })
        .collect()
}
#[must_use]
pub fn parse_search_results(
provider: &str,
body: &str,
limit: usize,
) -> (Vec<SearchResultItem>, bool) {
if provider == "wikipedia" {
return (parse_wikipedia(body, limit), false);
}
let blocked = looks_like_captcha(body);
let doc = Html::parse_document(body);
let results = match provider {
"duckduckgo" => parse_duckduckgo(&doc, limit),
"google" => parse_google(&doc, limit),
"bing" => parse_bing(&doc, limit),
"brave" => parse_brave(&doc, limit),
_ => Vec::new(),
};
(results, blocked)
}
/// Renders a [`SearchResult`] as a Markdown document.
///
/// The output starts with a heading containing the query, followed by a
/// bullet list of metadata (provider, capture mode, capture time, source URL
/// and, when flagged, a CAPTCHA warning). After a blank line comes either
/// `_No results._` or one numbered `[title](url)` line per result, each
/// followed by its snippet line when the snippet is not empty. Lines are
/// joined with `\n` and there is no trailing newline.
#[must_use]
pub fn format_search_as_markdown(result: &SearchResult) -> String {
    let diagnostics = &result.diagnostics;
    let mut out = vec![
        format!("# Search results for \"{}\"", result.query),
        String::new(),
        format!("- Provider: `{}`", result.provider),
        format!("- Capture mode: `{}`", result.capture_mode),
        format!("- Captured at: {}", result.captured_at),
        format!("- Source: {}", diagnostics.source_url),
    ];
    if diagnostics.blocked_by_captcha {
        out.push(String::from(
            "- ⚠️ Provider returned a CAPTCHA / bot-block page.",
        ));
    }
    out.push(String::new());

    if result.results.is_empty() {
        out.push(String::from("_No results._"));
    } else {
        for entry in &result.results {
            out.push(format!("{}. [{}]({})", entry.rank, entry.title, entry.url));
            if !entry.snippet.is_empty() {
                out.push(format!(" {}", entry.snippet));
            }
        }
    }
    out.join("\n")
}
/// Runs `query` against `provider` over HTTP and returns the parsed results.
///
/// `capture_mode` and `captured_at` are copied into the returned
/// [`SearchResult`] unchanged. Transport problems do not fail the call: when
/// the client cannot be built, the request fails, or the body cannot be read,
/// the function still returns `Ok` with an empty result list and the error
/// text stored in `diagnostics.error`. `diagnostics.status` holds the HTTP
/// status code once a response has been received and is 0 otherwise.
///
/// # Errors
///
/// Returns `Err` only for invalid arguments: a blank `query` or a `provider`
/// that is not one of [`SEARCH_PROVIDERS`].
pub async fn search(
    query: &str,
    provider: &str,
    limit: usize,
    capture_mode: &str,
    captured_at: &str,
) -> Result<SearchResult, String> {
    if query.trim().is_empty() {
        return Err("Missing `query` parameter".to_string());
    }
    if !is_supported_provider(provider) {
        return Err(format!(
            "Unknown search provider \"{provider}\". Supported: {}",
            SEARCH_PROVIDERS.join(", ")
        ));
    }
    let source_url = build_search_url(provider, query, limit)?;

    // Wikipedia is queried through its JSON API; all other providers serve HTML.
    let accept = match provider {
        "wikipedia" => "application/json",
        _ => "text/html,application/xhtml+xml",
    };

    // Stage 1: build the client and send the request.
    let response = match reqwest::Client::builder().user_agent(USER_AGENT).build() {
        Ok(client) => {
            client
                .get(&source_url)
                .header("Accept", accept)
                .header("Accept-Language", "en-US,en;q=0.9")
                .send()
                .await
        }
        Err(e) => Err(e),
    };

    // Stage 2: record the status as soon as a response exists, then read the body.
    let mut status: u16 = 0;
    let body = match response {
        Ok(reply) => {
            status = reply.status().as_u16();
            reply.text().await
        }
        Err(e) => Err(e),
    };

    // Stage 3: parse the body, or keep the first error encountered.
    let (results, blocked_by_captcha, error) = match body {
        Ok(text) => {
            let (items, blocked) = parse_search_results(provider, &text, limit);
            (items, blocked, None)
        }
        Err(e) => (Vec::new(), false, Some(e.to_string())),
    };

    Ok(SearchResult {
        query: query.to_string(),
        provider: provider.to_string(),
        capture_mode: capture_mode.to_string(),
        captured_at: captured_at.to_string(),
        results,
        diagnostics: SearchDiagnostics {
            status,
            blocked_by_cors: false,
            blocked_by_captcha,
            source_url,
            error,
        },
    })
}
// Unit tests for URL building, the per-provider parsers, CAPTCHA detection,
// Markdown rendering and the serialized JSON shape. All fixtures are inline;
// no test performs network I/O.
#[cfg(test)]
mod tests {
use super::*;
// Fixture: Wikipedia search response with two pages; the first excerpt
// contains HTML tags that the parser is expected to strip.
const WIKI_JSON: &str = r#"{"pages":[
{"id":1,"key":"Formal_methods","title":"Formal methods","excerpt":"the <span>study</span> of <b>formal</b>","description":"rigorous"},
{"id":2,"key":"Formal_system","title":"Formal system","excerpt":"an abstract structure","description":""}
]}"#;
// Fixture: DuckDuckGo result markup whose links are scheme-relative redirect
// URLs carrying the real target in the `uddg` parameter.
const DDG_HTML: &str = r#"
<div class="result__body">
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fa&rut=abc">First & Best</a>
<div class="result__snippet">Snippet about the <b>first</b> result</div>
</div>
<div class="result__body">
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.org%2Fb">Second result</a>
<div class="result__snippet">Snippet two</div>
</div>
"#;
// Fixture: a single Bing organic result.
const BING_HTML: &str = r#"
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://bing-result.example/1">Bing One</a></h2>
<div class="b_caption"><p>Bing snippet one</p></div>
</li>
</ol>
"#;
// The Wikipedia URL carries both the query and the limit.
#[test]
fn builds_wikipedia_url() {
assert_eq!(
build_search_url("wikipedia", "formal", 5).unwrap(),
"https://en.wikipedia.org/w/rest.php/v1/search/page?q=formal&limit=5"
);
}
// Providers outside the supported list are rejected with an error.
#[test]
fn rejects_unknown_provider_url() {
assert!(build_search_url("yahoo", "x", 5).is_err());
}
// JSON pages become items with article URLs and tag-free snippets; the
// Wikipedia path never reports a CAPTCHA block.
#[test]
fn parses_wikipedia_json() {
let (results, blocked) = parse_search_results("wikipedia", WIKI_JSON, 10);
assert!(!blocked);
assert_eq!(results.len(), 2);
assert_eq!(results[0].title, "Formal methods");
assert_eq!(
results[0].url,
"https://en.wikipedia.org/wiki/Formal_methods"
);
assert_eq!(results[0].snippet, "the study of formal");
assert_eq!(
results[1].url,
"https://en.wikipedia.org/wiki/Formal_system"
);
}
// `limit` caps the number of returned items.
#[test]
fn respects_limit() {
let (results, _) = parse_search_results("wikipedia", WIKI_JSON, 1);
assert_eq!(results.len(), 1);
}
// Redirect links are unwrapped to their `uddg` target and inline markup is
// flattened to plain text.
#[test]
fn parses_duckduckgo_and_decodes_redirects() {
let (results, _) = parse_search_results("duckduckgo", DDG_HTML, 10);
assert_eq!(results.len(), 2);
assert_eq!(results[0].title, "First & Best");
assert_eq!(results[0].url, "https://example.com/a");
assert_eq!(results[0].snippet, "Snippet about the first result");
assert_eq!(results[1].url, "https://example.org/b");
}
// Title, URL and snippet are read from a `li.b_algo` block.
#[test]
fn parses_bing() {
let (results, _) = parse_search_results("bing", BING_HTML, 10);
assert_eq!(results.len(), 1);
assert_eq!(results[0].title, "Bing One");
assert_eq!(results[0].url, "https://bing-result.example/1");
assert_eq!(results[0].snippet, "Bing snippet one");
}
// A body that is not valid JSON produces an empty list instead of an error.
#[test]
fn empty_json_yields_no_results() {
let (results, _) = parse_search_results("wikipedia", "not json", 10);
assert!(results.is_empty());
}
// Marker matching is case-insensitive and does not fire on ordinary text.
#[test]
fn detects_captcha() {
assert!(looks_like_captcha("Please solve the CAPTCHA"));
assert!(looks_like_captcha(
"Our systems have detected unusual traffic"
));
assert!(!looks_like_captcha("normal results page"));
}
// The Markdown output contains the heading, the numbered link and the snippet.
#[test]
fn formats_markdown() {
let result = SearchResult {
query: "formal-ai".to_string(),
provider: "wikipedia".to_string(),
capture_mode: "fetch".to_string(),
captured_at: "2026-05-30T00:00:00Z".to_string(),
results: vec![SearchResultItem {
rank: 1,
title: "Formal methods".to_string(),
url: "https://en.wikipedia.org/wiki/Formal_methods".to_string(),
snippet: "study of formal".to_string(),
}],
diagnostics: SearchDiagnostics {
status: 200,
blocked_by_cors: false,
blocked_by_captcha: false,
source_url: "https://example.com".to_string(),
error: None,
},
};
let md = format_search_as_markdown(&result);
assert!(md.contains("# Search results for \"formal-ai\""));
assert!(md.contains("1. [Formal methods](https://en.wikipedia.org/wiki/Formal_methods)"));
assert!(md.contains("study of formal"));
}
// Serialized keys are camelCase and a `None` error is left out entirely.
#[test]
fn serializes_camel_case_contract() {
let result = SearchResult {
query: "q".to_string(),
provider: "wikipedia".to_string(),
capture_mode: "fetch".to_string(),
captured_at: "t".to_string(),
results: vec![],
diagnostics: SearchDiagnostics {
status: 200,
blocked_by_cors: false,
blocked_by_captcha: false,
source_url: "u".to_string(),
error: None,
},
};
let json = serde_json::to_string(&result).unwrap();
assert!(json.contains("\"captureMode\""));
assert!(json.contains("\"capturedAt\""));
assert!(json.contains("\"blockedByCaptcha\""));
assert!(json.contains("\"sourceUrl\""));
assert!(!json.contains("\"error\""));
}
}