use crate::types::{SearchResult, SelectorConfig};
use scraper::{ElementRef, Html, Selector};
use std::sync::OnceLock;
fn sel_tr() -> &'static Selector {
static C: OnceLock<Selector> = OnceLock::new();
C.get_or_init(|| Selector::parse("tr").unwrap())
}
fn sel_strategy2_links() -> &'static Selector {
static C: OnceLock<Selector> = OnceLock::new();
C.get_or_init(|| Selector::parse("#links a[href], .result a[href]").unwrap())
}
pub(crate) struct CompiledSelectors {
pub result_item: Selector,
pub ad_class: Option<Selector>,
pub title_sel: Option<Selector>,
pub snippet: Option<Selector>,
pub display_url_sel: Option<Selector>,
pub ad_classes_raw: Vec<String>,
pub ad_attributes: Vec<(String, String)>,
pub url_patterns: Vec<String>,
}
impl CompiledSelectors {
pub fn compile(cfg: &SelectorConfig) -> Option<Self> {
let result_item = match Selector::parse(&cfg.html_endpoint.result_item) {
Ok(s) => s,
Err(error) => {
tracing::error!(
?error,
selector = %cfg.html_endpoint.result_item,
"Result selector invalid — cannot extract"
);
return None;
}
};
let join_ad = cfg.html_endpoint.ads_filter.ad_classes.join(", ");
let ad_class = if join_ad.is_empty() {
None
} else {
Selector::parse(&join_ad).ok()
};
let title_sel = Selector::parse(&cfg.html_endpoint.title_and_url).ok();
let snippet = Selector::parse(&cfg.html_endpoint.snippet).ok();
let display_url_sel = Selector::parse(&cfg.html_endpoint.display_url).ok();
let ad_classes_raw = cfg
.html_endpoint
.ads_filter
.ad_classes
.iter()
.map(|c| c.trim_start_matches('.').to_string())
.collect();
let ad_attributes = cfg
.html_endpoint
.ads_filter
.ad_attributes
.iter()
.filter_map(|e| {
let mut parts = e.splitn(2, '=');
let key = parts.next()?.trim().to_string();
let value = parts.next()?.trim().to_string();
Some((key, value))
})
.collect();
let url_patterns = cfg.html_endpoint.ads_filter.ad_url_patterns.to_vec();
Some(Self {
result_item,
ad_class,
title_sel,
snippet,
display_url_sel,
ad_classes_raw,
ad_attributes,
url_patterns,
})
}
}
pub(crate) struct CompiledLiteSelectors {
pub link: Selector,
pub snippet_td: Selector,
}
impl CompiledLiteSelectors {
pub fn compile(cfg: &SelectorConfig) -> Option<Self> {
let link = Selector::parse(&cfg.lite_endpoint.result_link)
.or_else(|_| Selector::parse("a.result-link, a"))
.ok()?;
let snippet_td = Selector::parse(&cfg.lite_endpoint.result_snippet)
.or_else(|_| Selector::parse("td.result-snippet, td"))
.ok()?;
Some(Self { link, snippet_td })
}
}
const TITLE_LIMIT: usize = 200;
const URL_LIMIT: usize = 2000;
const SNIPPET_LIMIT: usize = 500;
fn join_text(el: &ElementRef<'_>) -> String {
let mut out = String::with_capacity(128);
let mut need_space = false;
for frag in el.text() {
for word in frag.split_whitespace() {
if need_space {
out.push(' ');
}
out.push_str(word);
need_space = true;
}
}
out
}
pub fn extract_results(raw_html: &str) -> Vec<SearchResult> {
let cfg = SelectorConfig::default();
extract_results_with_cfg(raw_html, &cfg)
}
pub fn extract_results_with_cfg(raw_html: &str, cfg: &SelectorConfig) -> Vec<SearchResult> {
let document = Html::parse_document(raw_html);
let Some(compiled) = CompiledSelectors::compile(cfg) else {
return Vec::new();
};
extract_with_document(&document, &compiled)
}
pub fn extract_results_with_strategies(raw_html: &str) -> Vec<SearchResult> {
let cfg = SelectorConfig::default();
extract_results_with_strategies_cfg(raw_html, &cfg)
}
pub fn extract_results_with_strategies_cfg(
raw_html: &str,
cfg: &SelectorConfig,
) -> Vec<SearchResult> {
let document = Html::parse_document(raw_html);
let mut results = match CompiledSelectors::compile(cfg) {
Some(compiled) => extract_with_document(&document, &compiled),
None => Vec::new(),
};
if !results.is_empty() {
return results;
}
tracing::debug!("Strategy 1 returned empty — trying Strategy 2 (semantic fallback)");
results = extract_strategy_2(&document);
if !results.is_empty() {
tracing::info!(total = results.len(), "Strategy 2 recovered results");
}
results
}
fn extract_strategy_2(document: &Html) -> Vec<SearchResult> {
let links_selector = sel_strategy2_links();
let mut results = Vec::with_capacity(16);
let mut position: u32 = 0;
let mut seen_urls: std::collections::HashSet<String> =
std::collections::HashSet::with_capacity(16);
for link in document.select(links_selector) {
let href = match link.value().attr("href") {
Some(h) if !h.is_empty() => h,
_ => continue,
};
let resolved_url = match resolve_url(href) {
Some(u) => u,
None => continue,
};
if resolved_url.contains("duckduckgo.com/y.js") || resolved_url.len() > URL_LIMIT {
continue;
}
if !seen_urls.insert(resolved_url.clone()) {
continue;
}
let raw_title = join_text(&link);
let title = normalize_text(&raw_title, TITLE_LIMIT);
if title.is_empty() {
continue;
}
let snippet = extract_snippet_from_ancestor(&link, &title);
position += 1;
results.push(SearchResult {
position,
title,
url: resolved_url,
display_url: None,
snippet,
original_title: None,
content: None,
content_size: None,
content_extraction_method: None,
});
if results.len() >= 50 {
break;
}
}
results
}
fn extract_snippet_from_ancestor(link: &ElementRef<'_>, title: &str) -> Option<String> {
let mut atual = link.parent();
let mut nivel = 0;
while let Some(no) = atual {
nivel += 1;
if nivel > 5 {
break;
}
if let Some(el) = ElementRef::wrap(no) {
let text = join_text(&el);
let normalized = normalize_text(&text, SNIPPET_LIMIT);
let without_title = normalized.replacen(title, "", 1);
let without_title_tr = without_title.trim();
if without_title_tr.chars().count() >= 40 {
return Some(normalize_text(without_title_tr, SNIPPET_LIMIT));
}
}
atual = no.parent();
}
None
}
pub fn extract_results_lite(raw_html: &str) -> Vec<SearchResult> {
let cfg = SelectorConfig::default();
extract_results_lite_with_cfg(raw_html, &cfg)
}
pub fn extract_results_lite_with_cfg(raw_html: &str, cfg: &SelectorConfig) -> Vec<SearchResult> {
let document = Html::parse_document(raw_html);
let Some(compiled_lite) = CompiledLiteSelectors::compile(cfg) else {
return Vec::new();
};
let sel_link = &compiled_lite.link;
let sel_snippet_td = &compiled_lite.snippet_td;
let mut results: Vec<SearchResult> = Vec::with_capacity(16);
let mut position: u32 = 0;
let mut pending_title: Option<(String, String)> = None;
for tr in document.select(sel_tr()) {
let link_candidate = tr.select(sel_link).next();
if let Some(link) = link_candidate {
let is_result_link = link
.value()
.attr("class")
.map(|c| c.contains("result-link"))
.unwrap_or(false);
if is_result_link || pending_title.is_none() {
if let Some(href) = link.value().attr("href") {
if let Some(resolved_url) = resolve_url(href) {
if resolved_url.contains("duckduckgo.com/y.js") {
continue;
}
let raw_title = join_text(&link);
let title = normalize_text(&raw_title, TITLE_LIMIT);
if !title.is_empty() && !resolved_url.contains("duckduckgo.com") {
if let Some((pending_t, pending_u)) = pending_title.take() {
position += 1;
results.push(SearchResult {
position,
title: pending_t,
url: pending_u,
display_url: None,
snippet: None,
original_title: None,
content: None,
content_size: None,
content_extraction_method: None,
});
}
pending_title = Some((title, resolved_url));
continue;
}
}
}
}
}
if let Some((title, url)) = pending_title.take() {
let snippet_text = tr
.select(sel_snippet_td)
.map(|td| join_text(&td))
.find(|t| t.split_whitespace().count() > 5);
let snippet = snippet_text.map(|t| normalize_text(&t, SNIPPET_LIMIT));
position += 1;
results.push(SearchResult {
position,
title,
url,
display_url: None,
snippet,
original_title: None,
content: None,
content_size: None,
content_extraction_method: None,
});
}
if results.len() >= 50 {
break;
}
}
if let Some((title, url)) = pending_title {
position += 1;
results.push(SearchResult {
position,
title,
url,
display_url: None,
snippet: None,
original_title: None,
content: None,
content_size: None,
content_extraction_method: None,
});
}
results
}
fn extract_with_document(document: &Html, compiled: &CompiledSelectors) -> Vec<SearchResult> {
let mut results = Vec::with_capacity(16);
let mut position: u32 = 0;
for result_element in document.select(&compiled.result_item) {
if let Some(ref ad_sel) = compiled.ad_class {
if result_element.select(ad_sel).next().is_some()
|| contem_classe_anuncio_dinamico(&result_element, &compiled.ad_classes_raw)
{
tracing::trace!("Result filtered by ad class");
continue;
}
}
let mut filtered_by_attribute = false;
for (key, value) in &compiled.ad_attributes {
if result_element.value().attr(key.as_str()) == Some(value.as_str()) {
tracing::trace!(attribute = %key, "Result filtered by ad attribute");
filtered_by_attribute = true;
break;
}
}
if filtered_by_attribute {
continue;
}
let Some(ref title_selector) = compiled.title_sel else {
continue;
};
let title_element = match result_element.select(title_selector).next() {
Some(e) => e,
None => {
tracing::trace!("Result missing title element — skipping");
continue;
}
};
let raw_title = join_text(&title_element);
let title = normalize_text(&raw_title, TITLE_LIMIT);
if title.is_empty() {
continue;
}
let raw_url = match title_element.value().attr("href") {
Some(href) => href.to_string(),
None => {
tracing::trace!("Title missing href attribute — skipping");
continue;
}
};
let resolved_url = match resolve_url(&raw_url) {
Some(u) => u,
None => {
tracing::trace!(url = %raw_url, "URL filtered or invalid");
continue;
}
};
if compiled
.url_patterns
.iter()
.any(|p| resolved_url.contains(p))
{
tracing::trace!(url = %resolved_url, "URL filtered by ad pattern");
continue;
}
if resolved_url.len() > URL_LIMIT {
tracing::trace!(size = resolved_url.len(), "URL exceeds limit — skipping");
continue;
}
let snippet = compiled.snippet.as_ref().and_then(|sel| {
result_element
.select(sel)
.next()
.map(|el| normalize_text(&join_text(&el), SNIPPET_LIMIT))
.filter(|s| !s.is_empty())
});
let display_url = compiled.display_url_sel.as_ref().and_then(|sel| {
result_element
.select(sel)
.next()
.map(|el| normalize_text(&join_text(&el), URL_LIMIT))
.filter(|s| !s.is_empty())
});
let (final_title, original_title) =
apply_official_site_heuristic(title, display_url.as_deref());
position += 1;
results.push(SearchResult {
position,
title: final_title,
url: resolved_url,
display_url,
snippet,
original_title,
content: None,
content_size: None,
content_extraction_method: None,
});
}
tracing::debug!(
total = results.len(),
"Extraction complete after ad filtering"
);
results
}
fn contem_classe_anuncio_dinamico(element: &ElementRef<'_>, raw_classes: &[String]) -> bool {
element
.value()
.classes()
.any(|class| raw_classes.iter().any(|c| c == class))
}
fn apply_official_site_heuristic(
title: String,
display_url: Option<&str>,
) -> (String, Option<String>) {
if title.eq_ignore_ascii_case("Official site") {
if let Some(friendly_url) = display_url.map(str::trim).filter(|s| !s.is_empty()) {
return (friendly_url.to_string(), Some(title));
}
}
(title, None)
}
fn normalize_text(raw: &str, limit: usize) -> String {
let mut result_buf = String::with_capacity(raw.len().min(limit + 64));
let mut needs_space = false;
let mut chars_written: usize = 0;
for word in raw.split_whitespace() {
let separator = usize::from(needs_space);
let word_len = word.chars().count();
if chars_written + separator + word_len > limit {
let remaining = limit.saturating_sub(chars_written + separator);
if remaining > 0 {
if needs_space {
result_buf.push(' ');
}
for ch in word.chars().take(remaining) {
result_buf.push(ch);
}
}
break;
}
if needs_space {
result_buf.push(' ');
chars_written += 1;
}
result_buf.push_str(word);
chars_written += word_len;
needs_space = true;
}
result_buf
}
pub fn resolve_url(href: &str) -> Option<String> {
let href_trim = href.trim();
if href_trim.is_empty() {
return None;
}
let normalized = if let Some(rest) = href_trim.strip_prefix("//") {
format!("https://{rest}")
} else if href_trim.starts_with('/') {
format!("https://duckduckgo.com{href_trim}")
} else {
href_trim.to_string()
};
if let Some(uddg_decoded) = extract_uddg(&normalized) {
return Some(uddg_decoded);
}
if eh_url_duckduckgo(&normalized) {
return None;
}
Some(normalized)
}
fn extract_uddg(url: &str) -> Option<String> {
let idx_uddg = url.find("uddg=")?;
let after_equals = &url[idx_uddg + "uddg=".len()..];
let encoded_value = match after_equals.find('&') {
Some(end) => &after_equals[..end],
None => after_equals,
};
urlencoding::decode(encoded_value)
.ok()
.map(|cow| cow.into_owned())
}
fn eh_url_duckduckgo(url: &str) -> bool {
let after_proto = if let Some(pos) = url.find("://") {
&url[pos + 3..]
} else {
url
};
let host = after_proto
.split('/')
.next()
.unwrap_or(after_proto)
.split('?')
.next()
.unwrap_or(after_proto);
host.eq_ignore_ascii_case("duckduckgo.com")
|| host.eq_ignore_ascii_case("html.duckduckgo.com")
|| host.eq_ignore_ascii_case("lite.duckduckgo.com")
|| (host.len() > ".duckduckgo.com".len()
&& host[host.len() - ".duckduckgo.com".len()..].eq_ignore_ascii_case(".duckduckgo.com"))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn resolver_url_prefixa_protocol_relative() {
assert_eq!(
resolve_url("//exemplo.com/caminho"),
Some("https://exemplo.com/caminho".to_string())
);
}
#[test]
fn resolver_url_desencapsula_redirect_uddg() {
let href = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexemplo.com%2Fnoticia&rut=abc123";
let resolvida = resolve_url(href).expect("should decode uddg");
assert_eq!(resolvida, "https://exemplo.com/noticia");
}
#[test]
fn resolve_url_unwraps_uddg_with_absolute_path() {
let href = "/l/?uddg=https%3A%2F%2Fexemplo.com%2Farticle";
let resolvida = resolve_url(href).expect("should decode uddg");
assert_eq!(resolvida, "https://exemplo.com/article");
}
#[test]
fn resolve_url_filters_duckduckgo_without_uddg() {
assert_eq!(resolve_url("https://duckduckgo.com/settings"), None);
assert_eq!(resolve_url("//html.duckduckgo.com/html/?q=teste"), None);
}
#[test]
fn resolver_url_mantem_absolutas_externas() {
assert_eq!(
resolve_url("https://exemplo.com.br/noticia"),
Some("https://exemplo.com.br/noticia".to_string())
);
}
#[test]
fn resolve_url_returns_none_for_empty_string() {
assert_eq!(resolve_url(""), None);
assert_eq!(resolve_url(" "), None);
}
#[test]
fn normalize_text_colapsa_whitespace() {
assert_eq!(
normalize_text(" olá mundo\n\n\ttexto ", 100),
"olá mundo texto"
);
}
#[test]
fn normalize_text_trunca_respeitando_char_boundary() {
let long_text = "á".repeat(300);
let truncated = normalize_text(&long_text, 200);
assert_eq!(truncated.chars().count(), 200);
}
#[test]
fn extract_results_works_with_minimal_html() {
let html = r#"
<html><body>
<div id="links">
<div class="result">
<a class="result__a" href="//exemplo.com/pagina">Título Exemplo</a>
<a class="result__snippet">Esta é uma descrição de exemplo.</a>
<span class="result__url">exemplo.com</span>
</div>
<div class="result result--ad">
<a class="result__a" href="//anuncio.com">Anúncio Pago</a>
</div>
<div class="result">
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwikipedia.org%2Fwiki%2FRust">Rust</a>
<a class="result__snippet">Linguagem de programação Rust.</a>
</div>
</div>
</body></html>
"#;
let results = extract_results(html);
assert_eq!(results.len(), 2, "deve filtrar o anúncio");
assert_eq!(results[0].position, 1);
assert_eq!(results[0].title, "Título Exemplo");
assert_eq!(results[0].url, "https://exemplo.com/pagina");
assert_eq!(
results[0].snippet.as_deref(),
Some("Esta é uma descrição de exemplo.")
);
assert_eq!(results[1].position, 2);
assert_eq!(results[1].title, "Rust");
assert_eq!(results[1].url, "https://wikipedia.org/wiki/Rust");
}
#[test]
fn extract_results_filters_js_urls() {
let html = r#"
<div id="links">
<div class="result">
<a class="result__a" href="//duckduckgo.com/y.js?ad=1">Tracker</a>
</div>
<div class="result">
<a class="result__a" href="//site-valido.com/pagina">Válido</a>
</div>
</div>
"#;
let results = extract_results(html);
assert_eq!(results.len(), 1);
assert_eq!(results[0].title, "Válido");
}
#[test]
fn extract_results_respects_data_nrn_ad_attribute() {
let html = r#"
<div id="links">
<div class="result" data-nrn="ad">
<a class="result__a" href="//anuncio.com">Patrocinado</a>
</div>
<div class="result" data-nrn="organic">
<a class="result__a" href="//organico.com">Orgânico</a>
</div>
</div>
"#;
let results = extract_results(html);
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://organico.com");
}
#[test]
fn extract_results_empty_returns_empty_vec() {
let html = "<html><body>Sem results</body></html>";
let results = extract_results(html);
assert!(results.is_empty());
}
#[test]
fn strategy_2_recovers_when_classes_absent() {
let html = r#"
<html><body>
<div id="links">
<div>
<a href="//exemplo.com/artigo">Título do Artigo de Exemplo</a>
<p>Este é o snippet descritivo do artigo que precisa ter texto suficiente para ser considerado substancial e assim ser capturado como snippet pela heurística de extração.</p>
</div>
<div>
<a href="//outro-site.com/noticia">Notícia Externa Importante</a>
<p>Descrição relevante da notícia com mais de quarenta caracteres para garantir captura pela heurística de snippet.</p>
</div>
</div>
</body></html>
"#;
let results = extract_results_with_strategies(html);
assert!(
results.len() >= 2,
"Estratégia 2 deve recuperar pelo menos 2 results"
);
assert_eq!(results[0].title, "Título do Artigo de Exemplo");
assert_eq!(results[0].url, "https://exemplo.com/artigo");
}
#[test]
fn strategy_2_does_not_run_if_strategy_1_worked() {
let html = r#"
<html><body>
<div id="links">
<div class="result">
<a class="result__a" href="//valido.com">Válido via Estratégia 1</a>
<a class="result__snippet">Snippet curto.</a>
</div>
</div>
</body></html>
"#;
let results = extract_results_with_strategies(html);
assert_eq!(results.len(), 1);
assert_eq!(results[0].title, "Válido via Estratégia 1");
}
#[test]
fn extract_results_lite_parses_duckduckgo_lite_table() {
let html = r#"
<html><body>
<table>
<tr>
<td valign="top">1. </td>
<td><a rel="nofollow" href="//exemplo.com/pagina1" class="result-link">Primeiro Resultado Lite</a></td>
</tr>
<tr>
<td> </td>
<td class="result-snippet">Esta é a descrição do primeiro resultado com texto suficiente para ser reconhecido.</td>
</tr>
<tr>
<td valign="top">2. </td>
<td><a rel="nofollow" href="//exemplo.com/pagina2" class="result-link">Segundo Resultado Lite</a></td>
</tr>
<tr>
<td> </td>
<td class="result-snippet">Descrição do segundo resultado com bastante texto também.</td>
</tr>
</table>
</body></html>
"#;
let results = extract_results_lite(html);
assert_eq!(results.len(), 2);
assert_eq!(results[0].position, 1);
assert_eq!(results[0].title, "Primeiro Resultado Lite");
assert_eq!(results[0].url, "https://exemplo.com/pagina1");
assert!(results[0].snippet.is_some());
assert_eq!(results[1].title, "Segundo Resultado Lite");
}
#[test]
fn extract_results_lite_empty_returns_empty_vec() {
let html = "<html><body><p>Nada aqui</p></body></html>";
let results = extract_results_lite(html);
assert!(results.is_empty());
}
#[test]
fn extract_results_with_custom_cfg_uses_alternate_selector() {
let html = r#"
<div id="custom-links">
<div class="custom-result">
<a class="custom-title" href="//site.com/a">Título A</a>
<span class="custom-snippet">Snippet A</span>
</div>
<div class="custom-result">
<a class="custom-title" href="//site.com/b">Título B</a>
<span class="custom-snippet">Snippet B</span>
</div>
</div>
"#;
let padrao = extract_results(html);
assert!(
padrao.is_empty(),
"default não deve casar com .custom-result"
);
let mut cfg = SelectorConfig::default();
cfg.html_endpoint.result_item = "#custom-links .custom-result".to_string();
cfg.html_endpoint.title_and_url = ".custom-title".to_string();
cfg.html_endpoint.snippet = ".custom-snippet".to_string();
let results = extract_results_with_cfg(html, &cfg);
assert_eq!(results.len(), 2);
assert_eq!(results[0].title, "Título A");
assert_eq!(results[1].title, "Título B");
}
#[test]
fn extract_results_with_cfg_filters_custom_classes() {
let html = r#"
<div id="links">
<div class="result organic">
<a class="result__a" href="//a.com">Orgânico</a>
</div>
<div class="result my-custom-ad">
<a class="result__a" href="//ad.com">Anúncio Custom</a>
</div>
</div>
"#;
let mut cfg = SelectorConfig::default();
cfg.html_endpoint.ads_filter.ad_classes = vec![".my-custom-ad".to_string()];
let results = extract_results_with_cfg(html, &cfg);
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://a.com");
}
#[test]
fn extract_results_lite_filters_duckduckgo_links() {
let html = r#"
<table>
<tr><td><a href="//duckduckgo.com/about" class="result-link">Sobre DDG</a></td></tr>
<tr><td class="result-snippet">Snippet do DDG não deve aparecer.</td></tr>
<tr><td><a href="//externo.com/doc" class="result-link">Doc Externa</a></td></tr>
<tr><td class="result-snippet">Descrição da documentação externa relevante.</td></tr>
</table>
"#;
let results = extract_results_lite(html);
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://externo.com/doc");
}
}