use case_insensitive_string::compact_str::CompactString;
use case_insensitive_string::CaseInsensitiveString;
use hashbrown::HashSet;
use std::cmp::Ordering as CmpOrdering;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ScoredUrl {
pub priority: i32,
pub url: CaseInsensitiveString,
}
impl Ord for ScoredUrl {
fn cmp(&self, other: &Self) -> CmpOrdering {
self.priority
.cmp(&other.priority)
.then_with(|| self.url.cmp(&other.url))
}
}
impl PartialOrd for ScoredUrl {
fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
Some(self.cmp(other))
}
}
pub struct UrlFrontier {
heap: BinaryHeap<ScoredUrl>,
visited: HashSet<CompactString>,
round_robin: bool,
last_domain: Option<CompactString>,
rr_buf: VecDeque<ScoredUrl>,
}
impl UrlFrontier {
pub fn new(round_robin: bool) -> Self {
Self {
heap: BinaryHeap::new(),
visited: HashSet::new(),
round_robin,
last_domain: None,
rr_buf: VecDeque::new(),
}
}
pub fn push(&mut self, url: CaseInsensitiveString, priority: i32) -> bool {
let key = CompactString::new(url.inner());
if !self.visited.insert(key) {
return false;
}
self.heap.push(ScoredUrl { priority, url });
true
}
pub fn pop(&mut self) -> Option<CaseInsensitiveString> {
if !self.round_robin {
return self.heap.pop().map(|s| s.url);
}
let last = self.last_domain.as_deref();
let mut found: Option<ScoredUrl> = None;
while let Some(entry) = self.heap.pop() {
let domain = extract_domain(entry.url.inner());
let same = match last {
Some(prev) => domain.as_str() == prev,
None => false,
};
if same && found.is_none() {
self.rr_buf.push_back(entry);
} else {
found = Some(entry);
break;
}
}
if found.is_none() {
found = self.rr_buf.pop_front();
}
for item in self.rr_buf.drain(..) {
self.heap.push(item);
}
if let Some(ref entry) = found {
self.last_domain = Some(extract_domain(entry.url.inner()));
}
found.map(|s| s.url)
}
pub fn extend_with_priority(
&mut self,
urls: impl Iterator<Item = CaseInsensitiveString>,
priority: i32,
) {
for url in urls {
self.push(url, priority);
}
}
#[inline]
pub fn len(&self) -> usize {
self.heap.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.heap.is_empty()
}
}
const HIGH_VALUE: &[&str] = &["product", "article", "item", "page"];
const LOW_VALUE: &[&str] = &["legal", "privacy", "terms", "cookie", "disclaimer"];
pub fn score_url(url: &str, depth: u32) -> i32 {
let base: i32 = 1000i32.saturating_sub((depth as i32).saturating_mul(100));
let path = url_path(url);
let mut score = base;
for seg in HIGH_VALUE {
if contains_ignore_ascii_case(path, seg) {
score = score.saturating_add(50);
}
}
for seg in LOW_VALUE {
if contains_ignore_ascii_case(path, seg) {
score = score.saturating_sub(200);
}
}
score.clamp(0, 2000)
}
fn extract_domain(url: &str) -> CompactString {
if let Some(start) = url.find("://") {
let after = start + 3;
let rest = &url[after..];
let end = rest.find('/').unwrap_or(rest.len());
let host = &rest[..end];
let host = host.split(':').next().unwrap_or(host);
CompactString::new(host)
} else {
CompactString::default()
}
}
fn url_path(url: &str) -> &str {
if let Some(start) = url.find("://") {
let after = start + 3;
let rest = &url[after..];
if let Some(slash) = rest.find('/') {
let path_start = after + slash;
let remaining = &url[path_start..];
let end = remaining
.find('?')
.unwrap_or_else(|| remaining.find('#').unwrap_or(remaining.len()));
&remaining[..end]
} else {
"/"
}
} else {
url
}
}
#[inline]
fn contains_ignore_ascii_case(haystack: &str, needle: &str) -> bool {
if needle.len() > haystack.len() {
return false;
}
let hb = haystack.as_bytes();
let nb = needle.as_bytes();
for i in 0..=(hb.len() - nb.len()) {
if hb[i..i + nb.len()]
.iter()
.zip(nb.iter())
.all(|(h, n)| h.eq_ignore_ascii_case(n))
{
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::*;
fn cis(s: &str) -> CaseInsensitiveString {
CaseInsensitiveString::from(s)
}
#[test]
fn push_dedup() {
let mut f = UrlFrontier::new(false);
assert!(f.push(cis("https://example.com/a"), 100));
assert!(!f.push(cis("https://example.com/a"), 200)); assert_eq!(f.len(), 1);
}
#[test]
fn pop_highest_priority_first() {
let mut f = UrlFrontier::new(false);
f.push(cis("https://example.com/low"), 10);
f.push(cis("https://example.com/high"), 500);
f.push(cis("https://example.com/mid"), 100);
let first = f.pop().unwrap();
assert_eq!(first, cis("https://example.com/high"));
let second = f.pop().unwrap();
assert_eq!(second, cis("https://example.com/mid"));
}
#[test]
fn extend_with_priority_bulk() {
let mut f = UrlFrontier::new(false);
let urls = vec![
cis("https://a.com/1"),
cis("https://b.com/2"),
cis("https://a.com/1"), ];
f.extend_with_priority(urls.into_iter(), 50);
assert_eq!(f.len(), 2);
}
#[test]
fn score_url_depth_and_segments() {
let s = score_url("https://shop.com/product/widget", 0);
assert_eq!(s, 1050);
let s = score_url("https://shop.com/legal/privacy", 0);
assert_eq!(s, 600);
let s = score_url("https://shop.com/deep", 15);
assert_eq!(s, 0);
}
#[test]
fn round_robin_alternates_domains() {
let mut f = UrlFrontier::new(true);
f.push(cis("https://a.com/1"), 100);
f.push(cis("https://a.com/2"), 90);
f.push(cis("https://b.com/1"), 95);
let first = f.pop().unwrap();
assert_eq!(first, cis("https://a.com/1"));
let second = f.pop().unwrap();
assert_eq!(second, cis("https://b.com/1"));
let third = f.pop().unwrap();
assert_eq!(third, cis("https://a.com/2"));
}
#[test]
fn pop_empty_returns_none() {
let mut f = UrlFrontier::new(false);
assert!(f.pop().is_none());
assert!(f.is_empty());
}
#[test]
fn extract_domain_various() {
assert_eq!(
extract_domain("https://www.example.com/path"),
CompactString::new("www.example.com")
);
assert_eq!(
extract_domain("http://localhost:8080/test"),
CompactString::new("localhost")
);
assert_eq!(extract_domain("no-scheme"), CompactString::default());
}
#[test]
fn score_url_clamped() {
let s = score_url("https://x.com/product/article/item/page", 0);
assert_eq!(s, 1200);
let s = score_url("https://x.com/legal", 20);
assert_eq!(s, 0);
}
}