use growable_bloom_filter::GrowableBloom;
use parking_lot::Mutex;
use std::collections::HashSet;
use url::Url;
/// Tracks which string keys (typically URL spellings) have already been recorded.
///
/// Two stores are kept side by side: a Bloom filter holding the keys recorded so
/// far, and an exact set of the most recently recorded keys. Each store sits
/// behind its own mutex, which is why the methods only need `&self`.
pub struct Dedupe {
/// Growable Bloom filter of recorded keys, built from the false-positive rate
/// and expected size passed to `new`.
bloom: Mutex<GrowableBloom>,
/// Exact copies of recently recorded keys; consulted before the Bloom filter
/// and cleared wholesale once it holds `exact_cap` entries.
exact_recent: Mutex<HashSet<String>>,
/// Number of entries at which `exact_recent` is cleared before the next
/// insertion (`new` sets this to 100_000).
exact_cap: usize,
}
impl Dedupe {
pub fn new(expected: usize, fp_rate: f64) -> Self {
Self {
bloom: Mutex::new(GrowableBloom::new(fp_rate, expected)),
exact_recent: Mutex::new(HashSet::new()),
exact_cap: 100_000,
}
}
pub fn insert_if_new(&self, key: &str) -> bool {
{
let mut recent = self.exact_recent.lock();
if recent.contains(key) {
return false;
}
if recent.len() >= self.exact_cap {
recent.clear();
}
recent.insert(key.to_string());
}
let mut b = self.bloom.lock();
b.insert(key)
}
pub fn insert_url_set(&self, url: &Url) -> bool {
let perms = generate_url_permutations(url);
let any_seen = {
let recent = self.exact_recent.lock();
perms.iter().any(|p| recent.contains(p))
};
if !any_seen {
let b = self.bloom.lock();
let bloom_seen = perms.iter().any(|p| b.contains(p));
if bloom_seen {
} else {
drop(b);
for p in &perms {
self.insert_if_new(p);
}
return true;
}
}
for p in &perms {
self.insert_if_new(p);
}
false
}
}
/// Lists the URL spellings that are treated as the same page as `url`.
///
/// For `http`/`https` URLs with a host, the result combines:
/// * both schemes (`http`, `https`),
/// * the lower-cased host with and without a single leading `www.` label,
/// * the path with a trailing slash, with `/index.html`, `/index.htm` and
///   `/index.php` appended, and (unless it is the root) bare,
///
/// keeping the explicit port (if `Url::port` reports one) and the query as
/// normalized by `canonical_query`. Fragment and user-info are not part of the
/// output. Any other scheme, or a URL without a host, yields just the URL's own
/// string.
pub fn generate_url_permutations(url: &Url) -> Vec<String> {
    const INDEX_SUFFIXES: [&str; 3] = ["/index.html", "/index.htm", "/index.php"];

    let scheme = url.scheme();
    if scheme != "http" && scheme != "https" {
        return vec![url.as_str().to_string()];
    }
    let host = match url.host_str() {
        Some(h) => h.to_ascii_lowercase(),
        None => return vec![url.as_str().to_string()],
    };

    // Toggle exactly one `www.` label. `strip_prefix` removes a single
    // occurrence, whereas `trim_start_matches` would strip it repeatedly and
    // pair `www.www.example.com` with `example.com`.
    let alternate_host = match host.strip_prefix("www.") {
        Some(bare) => bare.to_string(),
        None => format!("www.{host}"),
    };

    // Port and query are identical for every permutation, so format them once.
    let port_suffix = url.port().map(|p| format!(":{p}")).unwrap_or_default();
    let query_suffix = canonical_query(url)
        .map(|q| format!("?{q}"))
        .unwrap_or_default();
    let authorities = [
        format!("{host}{port_suffix}"),
        format!("{alternate_host}{port_suffix}"),
    ];

    // Reduce the path to its directory-like base: drop one index-file suffix,
    // then any trailing slashes. The root path (and an empty one) becomes "".
    let path = url.path();
    let path_base = INDEX_SUFFIXES
        .iter()
        .find_map(|suffix| path.strip_suffix(*suffix))
        .unwrap_or(path)
        .trim_end_matches('/');

    let mut path_variants: Vec<String> = Vec::with_capacity(INDEX_SUFFIXES.len() + 2);
    path_variants.push(format!("{path_base}/"));
    path_variants.extend(
        INDEX_SUFFIXES
            .iter()
            .map(|suffix| format!("{path_base}{suffix}")),
    );
    if !path_base.is_empty() {
        // The slash-less form only makes sense below the root.
        path_variants.push(path_base.to_string());
    }

    let schemes = ["http", "https"];
    let mut out: Vec<String> =
        Vec::with_capacity(schemes.len() * authorities.len() * path_variants.len());
    for s in &schemes {
        for authority in &authorities {
            for p in &path_variants {
                out.push(format!("{s}://{authority}{p}{query_suffix}"));
            }
        }
    }

    // Defensive de-duplication that keeps the first occurrence of each spelling.
    let mut seen = HashSet::new();
    out.retain(|u| seen.insert(u.clone()));
    out
}
/// Produces an order-independent form of `url`'s query string.
///
/// Pairs whose key is recognised by `is_tracking_query_key` are dropped, the
/// remaining decoded `(key, value)` pairs are sorted, and the result is
/// re-encoded with the form-urlencoded serializer. Returns `None` when no
/// pairs remain (including when the URL has no query at all).
fn canonical_query(url: &Url) -> Option<String> {
    let mut kept: Vec<(String, String)> = Vec::new();
    for (name, value) in url.query_pairs() {
        if is_tracking_query_key(&name) {
            continue;
        }
        kept.push((name.into_owned(), value.into_owned()));
    }

    if kept.is_empty() {
        return None;
    }

    // Sorting by (key, value) makes `?b=2&a=1` and `?a=1&b=2` compare equal.
    kept.sort();

    let mut serializer = url::form_urlencoded::Serializer::new(String::new());
    for (name, value) in &kept {
        serializer.append_pair(name, value);
    }
    Some(serializer.finish())
}
/// Reports whether a query-parameter name is a known tracking parameter.
///
/// Matching ignores ASCII case. A key qualifies if it begins with `utm_` or is
/// exactly one of `fbclid`, `gclid`, `mc_cid`, `mc_eid`.
fn is_tracking_query_key(key: &str) -> bool {
    const EXACT_NAMES: [&str; 4] = ["fbclid", "gclid", "mc_cid", "mc_eid"];

    // Compare the first four bytes so a multi-byte leading character can never
    // cause a slice on a non-character boundary.
    let has_utm_prefix = key
        .as_bytes()
        .get(..4)
        .map_or(false, |head| head.eq_ignore_ascii_case(b"utm_"));

    has_utm_prefix
        || EXACT_NAMES
            .iter()
            .any(|name| key.eq_ignore_ascii_case(name))
}