use std::collections::{HashMap, HashSet};
use std::sync::LazyLock;
use crate::client::SearxngResult;
/// Smoothing constant added to every rank position in `rrf`.
const K_RRF: f64 = 60.0;
/// Term-frequency saturation parameter of the BM25-style score in `bm25_lite`.
const K1: f64 = 1.2;
/// Weight of document-length normalisation in the BM25-style score in `bm25_lite`.
const B: f64 = 0.5;
/// Minimum fraction of important query terms a result must contain to pass `covers`.
const MIN_COVERAGE: f64 = 0.5;
/// Query tokens that carry no topical signal; `important_terms` drops them
/// before coverage is measured.
pub static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Superlatives and price words.
        "top", "best", "good", "greatest", "finest", "cheapest", "cheap",
        // Articles, prepositions and conjunctions.
        "the", "a", "an", "in", "of", "to", "for", "and", "or", "near", "from", "with", "per",
        // Question scaffolding and pronouns.
        "how", "is", "are", "do", "does", "you", "your", "should", "what",
    ])
});
/// Hosts (as returned by `domain`) whose results `is_junk` always rejects.
/// Lookup is by exact host string.
static JUNK_HOSTS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Dictionaries.
        "merriam-webster.com",
        "dictionary.cambridge.org",
        "usdictionary.com",
        "dictionary.com",
        "vocabulary.com",
        "thefreedictionary.com",
        "collinsdictionary.com",
        "wiktionary.org",
        // Shops.
        "zara.com",
        "bestbuy.com",
        "ebay.com",
        "aliexpress.com",
        // News.
        "foxnews.com",
        "apnews.com",
        "news.google.com",
        // Miscellaneous.
        "culturedcode.com",
        "thingiverse.com",
        "apps.apple.com",
        "fix.com",
    ])
});
/// Host suffixes that `is_junk` rejects in addition to the exact hosts in `JUNK_HOSTS`.
const JUNK_HOST_SUFFIXES: &[&str] = &["myshopify.com"];
/// Geographic hint data for one place name keyed in `GEO`.
struct GeoEntry {
/// Terms whose presence in a result earns the positive part of `geo_score`.
region: &'static [&'static str],
/// Terms whose presence makes `geo_competing` reject a result (and `geo_score`
/// penalise it) — presumably names tied to other places sharing the query's name.
competing: &'static [&'static str],
}
/// Geographic hints, keyed by the query token that selects them in `geo_for`.
static GEO: LazyLock<HashMap<&'static str, GeoEntry>> = LazyLock::new(|| {
    /// Row constructor so every table entry fits on one or two lines.
    fn place(region: &'static [&'static str], competing: &'static [&'static str]) -> GeoEntry {
        GeoEntry { region, competing }
    }

    HashMap::from([
        (
            "belgrad",
            place(
                &["belgrade", "beograd", "serbia"],
                &["istanbul", "forest", "turkey", "maine", "lakes", "montana"],
            ),
        ),
        ("lisbon", place(&["lisbon", "lisboa", "portugal"], &[])),
        ("kyoto", place(&["kyoto", "japan"], &[])),
        ("tbilisi", place(&["tbilisi", "georgia"], &["atlanta"])),
        ("danang", place(&["nang", "danang", "vietnam"], &[])),
        ("porto", place(&["porto", "portugal"], &[])),
        ("tokyo", place(&["tokyo", "japan"], &[])),
        ("oaxaca", place(&["oaxaca", "mexico"], &[])),
        ("zurich", place(&["zurich", "switzerland", "swiss"], &[])),
        ("vienna", place(&["vienna", "austria", "wien"], &["virginia"])),
    ])
});
fn norm(s: &str) -> String {
s.to_lowercase()
.chars()
.map(fold_diacritic)
.collect::<String>()
}
/// Maps a lowercase accented Latin letter to its unaccented base letter.
///
/// Only the lowercase forms listed in the table are folded; any other
/// character (including uppercase accented letters) is returned unchanged.
fn fold_diacritic(c: char) -> char {
    const FOLDS: [(&str, char); 7] = [
        ("áàâäãå", 'a'),
        ("éèêë", 'e'),
        ("íìîï", 'i'),
        ("óòôöõ", 'o'),
        ("úùûü", 'u'),
        ("ç", 'c'),
        ("ñ", 'n'),
    ];
    for (accented, base) in FOLDS.iter() {
        if accented.contains(c) {
            return *base;
        }
    }
    c
}
/// Splits the normalised form of `s` into maximal runs of ASCII letters and
/// digits, discarding everything in between.
fn toks(s: &str) -> Vec<String> {
    let normalized = norm(s);
    let mut words = Vec::new();
    for piece in normalized.split(|ch: char| !ch.is_ascii_alphanumeric()) {
        if piece.is_empty() {
            continue;
        }
        words.push(piece.to_owned());
    }
    words
}
/// Extracts the lowercase host of `url`, without userinfo, port or a leading
/// `www.`.
///
/// Returns an empty string when `url` has no `//` separator.
fn domain(url: &str) -> String {
    // The authority ends at the first '/', '?' or '#', not only at '/':
    // "https://example.com?q=1" has no path but still has a host. Cutting
    // here also stops an '@' inside a query string being read as userinfo.
    let authority = url
        .split("//")
        .nth(1)
        .and_then(|rest| rest.split(|c: char| matches!(c, '/' | '?' | '#')).next())
        .unwrap_or("");
    // Everything up to the last '@' is userinfo.
    let host_port = authority.rsplit('@').next().unwrap_or("");
    // Anything after the first ':' is the port.
    let host = host_port.split(':').next().unwrap_or("").to_lowercase();
    match host.strip_prefix("www.") {
        Some(bare) => bare.to_string(),
        None => host,
    }
}
/// Reduces the host of `url` to its last two dot-separated labels.
///
/// Hosts with fewer than two labels are returned as-is. No public-suffix
/// knowledge is applied, so `bbc.co.uk` reduces to `co.uk`.
fn registrable(url: &str) -> String {
    let host = domain(url);
    let last_two = {
        let mut labels = host.rsplit('.');
        match (labels.next(), labels.next()) {
            (Some(tld), Some(sld)) => Some(format!("{}.{}", sld, tld)),
            _ => None,
        }
    };
    last_two.unwrap_or(host)
}
/// Returns the result's URL, or `""` when it has none.
fn url_of(r: &SearxngResult) -> &str {
    if let Some(url) = r.url.as_deref() {
        url
    } else {
        ""
    }
}
/// Returns the result's title, or `""` when it has none.
fn title_of(r: &SearxngResult) -> &str {
    if let Some(title) = r.title.as_deref() {
        title
    } else {
        ""
    }
}
/// Returns the result's content snippet, or `""` when it has none.
fn content_of(r: &SearxngResult) -> &str {
    if let Some(content) = r.content.as_deref() {
        content
    } else {
        ""
    }
}
// Not called from `rerank_core` in this file; kept available.
#[allow(dead_code)]
/// Reciprocal-rank score: the sum of `1 / (K_RRF + position)` over every
/// recorded position of the result.
///
/// A result with no recorded positions scores as if it had the single
/// position 1.
fn rrf(r: &SearxngResult) -> f64 {
    if r.positions.is_empty() {
        return 1.0 / (K_RRF + 1.0);
    }
    let mut total = 0.0;
    for &pos in r.positions.iter() {
        total += 1.0 / (K_RRF + pos as f64);
    }
    total
}
// Not called from `rerank_core` in this file; kept available.
#[allow(dead_code)]
/// Builds a min-max scaler for `vals`: the returned closure maps `v` to
/// `(v - min) / (max - min)`.
///
/// When the spread is at most `1e-9` (which includes an empty `vals`), the
/// closure returns `0.0` for every input.
fn minmax(vals: &[f64]) -> impl Fn(f64) -> f64 {
    let (lo, hi) = vals
        .iter()
        .fold((f64::INFINITY, f64::NEG_INFINITY), |(lo, hi), &v| {
            (lo.min(v), hi.max(v))
        });
    let span = hi - lo;
    move |v: f64| {
        if span > 1e-9 {
            (v - lo) / span
        } else {
            0.0
        }
    }
}
// Only used by `bm25_lite`, which is itself not called in this file.
#[allow(dead_code)]
/// Token stream of a result: title tokens, title tokens again, then content
/// tokens.
///
/// The title is included twice — presumably to weight title matches more
/// heavily in `bm25_lite`; confirm the intent before changing it.
fn doc_tokens(r: &SearxngResult) -> Vec<String> {
    let title = toks(title_of(r));
    let body = toks(content_of(r));
    let mut stream = Vec::with_capacity(2 * title.len() + body.len());
    stream.extend(title.iter().cloned());
    stream.extend(title);
    stream.extend(body);
    stream
}
// Not called from `rerank_core` in this file; kept available.
#[allow(dead_code)]
/// BM25-style relevance of every row against the `important` terms.
///
/// Returns one score per row, in row order. Document frequencies and the
/// average document length are computed over `rows` themselves; the average
/// length is floored at 1 before it is used for length normalisation.
fn bm25_lite(rows: &[&SearxngResult], important: &HashSet<String>) -> Vec<f64> {
    let docs: Vec<Vec<String>> = rows.iter().map(|row| doc_tokens(row)).collect();
    let doc_count = docs.len() as f64;
    let total_len: usize = docs.iter().map(Vec::len).sum();
    let avg_len = (total_len as f64 / docs.len().max(1) as f64).max(1.0);

    // Number of documents each token occurs in (counted once per document).
    let mut doc_freq: HashMap<&str, usize> = HashMap::new();
    for doc in &docs {
        let mut seen_here: HashSet<&str> = HashSet::new();
        for tok in doc {
            if seen_here.insert(tok.as_str()) {
                *doc_freq.entry(tok.as_str()).or_insert(0) += 1;
            }
        }
    }

    let mut scores = Vec::with_capacity(docs.len());
    for doc in &docs {
        // Length-normalisation part of the denominator; constant per document.
        let len_norm = K1 * (1.0 - B + B * doc.len() as f64 / avg_len);
        let score = important.iter().fold(0.0_f64, |acc, term| {
            let tf = doc.iter().filter(|tok| *tok == term).count() as f64;
            if tf == 0.0 {
                return acc;
            }
            let dfi = doc_freq.get(term.as_str()).copied().unwrap_or(0) as f64;
            let idf = (1.0 + (doc_count - dfi + 0.5) / (dfi + 0.5)).ln();
            acc + idf * (tf * (K1 + 1.0)) / (tf + len_norm)
        });
        scores.push(score);
    }
    scores
}
/// Decides whether a result should be discarded before ranking.
///
/// A result is junk when any of the following holds:
/// * its host is listed in `JUNK_HOSTS`, or equals / is a subdomain of an
///   entry in `JUNK_HOST_SUFFIXES`;
/// * its title has at most six tokens and one of them is a dictionary word
///   ("definition", "meaning", ...);
/// * its normalised title contains a bot-wall phrase ("just a moment", ...);
/// * its lowercased URL contains a machine-endpoint path segment.
fn is_junk(r: &SearxngResult) -> bool {
    const DICTIONARY_WORDS: [&str; 6] = [
        "definition",
        "meaning",
        "synonym",
        "synonyms",
        "antonym",
        "antonyms",
    ];
    const BOT_WALL_PHRASES: [&str; 6] = [
        "just a moment",
        "attention required",
        "verify you are human",
        "are you a robot",
        "access denied",
        "enable javascript",
    ];
    // NOTE(review): the association file is normally served without a trailing
    // slash, so the second pattern only fires when a further path segment
    // follows; the `/.well-known/` pattern covers the standard location.
    const MACHINE_PATHS: [&str; 3] = [
        "/mapfiles/",
        "/apple-app-site-association/",
        "/.well-known/",
    ];

    let url = url_of(r);
    let host = domain(url);

    // NOTE(review): `JUNK_HOSTS` is matched exactly, so an entry such as
    // "wiktionary.org" does not catch "en.wiktionary.org" — confirm intended.
    if JUNK_HOSTS.contains(host.as_str()) {
        return true;
    }
    // Suffixes must align with a label boundary: "shop.myshopify.com" and
    // "myshopify.com" match, "notmyshopify.com" does not.
    let under_junk_suffix = JUNK_HOST_SUFFIXES.iter().any(|suffix| {
        host.strip_suffix(*suffix)
            .is_some_and(|rest| rest.is_empty() || rest.ends_with('.'))
    });
    if under_junk_suffix {
        return true;
    }

    let title = norm(title_of(r));
    let title_toks = toks(title_of(r));
    // Short titles built around a dictionary word are definition pages.
    if title_toks.len() <= 6
        && title_toks
            .iter()
            .any(|word| DICTIONARY_WORDS.contains(&word.as_str()))
    {
        return true;
    }
    if BOT_WALL_PHRASES.iter().any(|phrase| title.contains(*phrase)) {
        return true;
    }

    let url_l = url.to_lowercase();
    MACHINE_PATHS.iter().any(|segment| url_l.contains(*segment))
}
/// Reports whether the result's title and content together contain at least
/// `MIN_COVERAGE` of the `important` terms.
///
/// An empty `important` set is always covered.
fn covers(r: &SearxngResult, important: &HashSet<String>) -> bool {
    if important.is_empty() {
        return true;
    }
    let present: HashSet<String> = toks(title_of(r))
        .into_iter()
        .chain(toks(content_of(r)))
        .collect();
    let hits = important.intersection(&present).count();
    hits as f64 / important.len() as f64 >= MIN_COVERAGE
}
/// Counts how many of the `important` terms occur as tokens in the result's
/// title or content. Returns 0 for an empty `important` set.
fn coverage_count(r: &SearxngResult, important: &HashSet<String>) -> usize {
    if important.is_empty() {
        return 0;
    }
    let mut present: HashSet<String> = HashSet::new();
    for field in [title_of(r), content_of(r)] {
        present.extend(toks(field));
    }
    let mut hits = 0;
    for term in important {
        if present.contains(term.as_str()) {
            hits += 1;
        }
    }
    hits
}
/// Reports whether the normalised title, content or URL of the result
/// contains any of the `competing` terms as a substring.
///
/// Always `false` when `competing` is empty.
fn geo_competing(r: &SearxngResult, competing: &[&str]) -> bool {
    if competing.is_empty() {
        return false;
    }
    let haystack = norm(&[title_of(r), content_of(r), url_of(r)].join(" "));
    for place in competing {
        if haystack.contains(*place) {
            return true;
        }
    }
    false
}
// Not called from `rerank_core` in this file; kept available.
#[allow(dead_code)]
/// Geographic affinity of a result: `+1` when any `region` term occurs in its
/// normalised title, content or URL, `-1` when any `competing` term does, and
/// the sum when both apply.
///
/// Returns `0.0` without inspecting the result when `region` is empty.
fn geo_score(r: &SearxngResult, region: &[&str], competing: &[&str]) -> f64 {
    if region.is_empty() {
        return 0.0;
    }
    let haystack = norm(&[title_of(r), content_of(r), url_of(r)].join(" "));
    let mentions_any = |terms: &[&str]| terms.iter().any(|term| haystack.contains(*term));
    let bonus = if mentions_any(region) { 1.0 } else { 0.0 };
    let penalty = if mentions_any(competing) { 1.0 } else { 0.0 };
    bonus - penalty
}
fn geo_for(query: &str) -> (&'static [&'static str], &'static [&'static str]) {
let qn: HashSet<String> = toks(query).into_iter().collect();
for (key, entry) in GEO.iter() {
if qn.contains(*key) || (*key == "danang" && qn.contains("nang")) {
return (entry.region, entry.competing);
}
}
(&[], &[])
}
/// Returns the distinct tokens of `query` that are not in `STOPWORDS`.
fn important_terms(query: &str) -> HashSet<String> {
    let mut kept = HashSet::new();
    for tok in toks(query) {
        if !STOPWORDS.contains(tok.as_str()) {
            kept.insert(tok);
        }
    }
    kept
}
/// Filters, orders and de-duplicates `rows` for `query` without the
/// max-coverage relevance gate.
///
/// The returned references borrow from `rows`.
pub fn rerank<'a>(rows: &'a [SearxngResult], query: &str) -> Vec<&'a SearxngResult> {
    const RELEVANCE_GATE: bool = false;
    rerank_core(rows, query, RELEVANCE_GATE)
}
/// Like `rerank`, but additionally keeps only the results that match the
/// largest number of important query terms.
///
/// The returned references borrow from `rows`.
pub fn rerank_relevance<'a>(rows: &'a [SearxngResult], query: &str) -> Vec<&'a SearxngResult> {
    const RELEVANCE_GATE: bool = true;
    rerank_core(rows, query, RELEVANCE_GATE)
}
/// Shared implementation of `rerank` and `rerank_relevance`.
///
/// Steps, in order:
/// 1. Drop junk rows (`is_junk`).
/// 2. Keep rows that cover enough important query terms (`covers`) and do not
///    mention a competing place (`geo_competing`).
/// 3. If that leaves nothing, fall back to all non-junk rows, or to every row
///    when all of them are junk, so a non-empty input never yields an empty
///    result.
/// 4. When `relevance` is set and the query has important terms, keep only the
///    rows with the highest term-coverage count (if that count is non-zero).
/// 5. Sort by descending `score`; the sort is stable, so ties keep input order.
///    A missing or NaN score counts as `0.0`.
/// 6. Keep the first row of each registrable domain.
fn rerank_core<'a>(
    rows: &'a [SearxngResult],
    query: &str,
    relevance: bool,
) -> Vec<&'a SearxngResult> {
    if rows.is_empty() {
        return Vec::new();
    }
    let important = important_terms(query);
    let (_, competing) = geo_for(query);

    let non_junk: Vec<&SearxngResult> = rows.iter().filter(|r| !is_junk(r)).collect();
    let mut cands: Vec<&SearxngResult> = non_junk
        .iter()
        .copied()
        .filter(|r| covers(r, &important) && !geo_competing(r, competing))
        .collect();
    if cands.is_empty() {
        // Degrade gracefully rather than returning nothing.
        cands = if non_junk.is_empty() {
            rows.iter().collect()
        } else {
            non_junk
        };
    }

    if relevance && !important.is_empty() {
        let covs: Vec<usize> = cands
            .iter()
            .map(|r| coverage_count(r, &important))
            .collect();
        let max_cov = covs.iter().copied().max().unwrap_or(0);
        if max_cov > 0 {
            // `retain` visits elements once, in order, so the counts line up.
            let mut counts = covs.into_iter();
            cands.retain(|_| counts.next() == Some(max_cov));
        }
    }

    // With NaN mapped to 0.0 the comparison below is a total preorder; a
    // comparator that is not one gives an unspecified order and may panic.
    let score_of = |r: &SearxngResult| r.score.filter(|s| !s.is_nan()).unwrap_or(0.0);
    cands.sort_by(|a, b| {
        score_of(b)
            .partial_cmp(&score_of(a))
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // `insert` returns false for a domain already seen, dropping the duplicate.
    let mut seen: HashSet<String> = HashSet::new();
    cands.retain(|r| seen.insert(registrable(url_of(r))));
    cands
}
// Unit tests for URL helpers, junk detection and the two public rerank entry points.
#[cfg(test)]
mod tests {
use super::*;
// Builds a fixture result with the given URL, title, content and positions;
// every fixture gets score 1.0, engine "test" and category "general".
fn row(url: &str, title: &str, content: &str, positions: Vec<u32>) -> SearxngResult {
SearxngResult {
url: Some(url.into()),
title: Some(title.into()),
engine: Some("test".into()),
content: Some(content.into()),
score: Some(1.0),
engines: Vec::new(),
positions,
category: Some("general".into()),
template: None,
published_date: None,
img_src: None,
thumbnail_src: None,
img_format: None,
resolution: None,
}
}
// `domain` lowercases the host and removes a leading "www." and the port,
// but keeps other subdomains.
#[test]
fn domain_strips_www_and_port() {
assert_eq!(domain("https://www.Example.com:8080/path"), "example.com");
assert_eq!(domain("http://sub.example.org/x"), "sub.example.org");
}
// `registrable` keeps only the last two host labels.
#[test]
fn registrable_takes_last_two_labels() {
assert_eq!(
registrable("https://dictionary.cambridge.org/x"),
"cambridge.org"
);
assert_eq!(
registrable("https://www.tripadvisor.com/y"),
"tripadvisor.com"
);
}
// A host listed in JUNK_HOSTS is junk.
#[test]
fn junk_dictionary_host_dropped() {
let r = row(
"https://www.merriam-webster.com/dictionary/best",
"best Definition",
"",
vec![1],
);
assert!(is_junk(&r));
}
// A bot-check interstitial title is junk even on an unlisted host.
#[test]
fn junk_bot_check_title_dropped() {
let r = row("https://example.com/", "Just a moment...", "", vec![1]);
assert!(is_junk(&r));
}
// An ordinary result on an unlisted host is not junk.
#[test]
fn non_junk_real_result_kept() {
let r = row(
"https://www.tripadvisor.com/Restaurants-Belgrade.html",
"THE 10 BEST Restaurants in Belgrade",
"best restaurants in belgrade serbia",
vec![1],
);
assert!(!is_junk(&r));
}
// Only the first result per registrable domain survives; equal scores keep input order.
#[test]
fn dedupe_by_registrable_domain() {
let rows = vec![
row("https://a.com/1", "alpha beta", "alpha beta", vec![1]),
row("https://a.com/2", "alpha beta", "alpha beta", vec![2]),
row("https://b.com/1", "alpha beta", "alpha beta", vec![3]),
];
let out = rerank(&rows, "alpha beta");
let doms: Vec<String> = out.iter().map(|r| registrable(url_of(r))).collect();
assert_eq!(doms, vec!["a.com", "b.com"]);
}
// When no row covers the query terms, the non-junk rows are returned instead of nothing.
#[test]
fn degrade_never_returns_empty_when_coverage_fails() {
let rows = vec![
row("https://a.com/1", "unrelated", "nothing matches", vec![1]),
row(
"https://b.com/1",
"also unrelated",
"still nothing",
vec![2],
),
];
let out = rerank(&rows, "quantum chromodynamics lattice");
assert_eq!(out.len(), 2);
}
// Empty input short-circuits to an empty output.
#[test]
fn empty_input_returns_empty() {
let rows: Vec<SearxngResult> = Vec::new();
assert!(rerank(&rows, "anything").is_empty());
}
// The coverage fallback uses non-junk rows only, so junk does not reappear.
#[test]
fn junk_never_leaks_through_degrade() {
let rows = vec![
row(
"https://www.merriam-webster.com/dictionary/best",
"best Definition",
"best",
vec![1],
),
row("https://real.com/1", "unrelated", "no match here", vec![2]),
];
let out = rerank(&rows, "quantum chromodynamics");
assert!(out.iter().all(|r| !is_junk(r)));
assert_eq!(out.len(), 1);
}
// With the relevance gate on, a row matching no important term is removed.
#[test]
fn relevance_gate_keeps_max_coverage_drops_zero_coverage() {
let rows = vec![
row(
"https://a.com/1",
"pizza in belgrade",
"great pizza belgrade serbia",
vec![1],
),
row(
"https://b.com/1",
"completely unrelated topic",
"nothing here at all",
vec![2],
),
];
let out = rerank_relevance(&rows, "best pizza in belgrade");
let doms: Vec<String> = out.iter().map(|r| registrable(url_of(r))).collect();
assert!(doms.contains(&"a.com".to_string()));
assert!(!doms.contains(&"b.com".to_string()));
}
// The gate is strict: a row matching two of three terms loses to one matching all three.
#[test]
fn relevance_gate_evicts_below_max_coverage() {
let rows = vec![
row(
"https://full.com/1",
"rust async tokio runtime",
"a complete guide to rust async with tokio",
vec![1],
),
row(
"https://partial.com/1",
"rust async runtime guide",
"deep dive into rust async programming",
vec![2],
),
row(
"https://zero.com/1",
"cooking recipes",
"how to bake bread",
vec![3],
),
];
let out = rerank_relevance(&rows, "rust async tokio");
let doms: Vec<String> = out.iter().map(|r| registrable(url_of(r))).collect();
assert!(doms.contains(&"full.com".to_string()));
assert!(
!doms.contains(&"partial.com".to_string()),
"partial-coverage row must be evicted by the hard max-coverage gate"
);
assert!(!doms.contains(&"zero.com".to_string()));
}
// When every row has the same coverage, the gate removes nothing.
#[test]
fn relevance_gate_noop_when_all_rows_equal_coverage() {
let rows = vec![
row(
"https://a.com/1",
"pizza belgrade",
"pizza belgrade",
vec![1],
),
row(
"https://b.com/1",
"pizza belgrade",
"pizza belgrade",
vec![2],
),
row(
"https://c.com/1",
"pizza belgrade",
"pizza belgrade",
vec![3],
),
];
let out = rerank_relevance(&rows, "best pizza in belgrade");
let doms: Vec<String> = out.iter().map(|r| registrable(url_of(r))).collect();
assert_eq!(doms.len(), 3);
assert!(doms.contains(&"a.com".to_string()));
assert!(doms.contains(&"b.com".to_string()));
assert!(doms.contains(&"c.com".to_string()));
}
// A query made only of stopwords has no important terms, so the gate is skipped.
#[test]
fn relevance_gate_degrade_safe_with_no_important_terms() {
let rows = vec![
row("https://a.com/1", "alpha", "alpha content", vec![1]),
row("https://b.com/1", "beta", "beta content", vec![2]),
];
let out = rerank_relevance(&rows, "the of in and a");
assert!(!out.is_empty());
assert_eq!(out.len(), 2);
}
}