use regex::Regex;
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use url::Url;
/// A URL bundled with the score and depth used to order it.
///
/// Equality is derived and therefore compares all three fields; ordering is
/// provided by the hand-written `Ord` implementation for this type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PrioritizedUrl {
/// The URL string exactly as it was handed to `PrioritizedUrl::new`.
pub url: String,
/// Score for this URL; `PrioritizedUrl::new` fills it from
/// `UrlPrioritizer::calculate_priority`.
pub priority: i32,
/// Depth value supplied by the caller.
/// NOTE(review): presumably the number of link hops from the crawl seed — confirm with callers.
pub depth: u32,
}
impl PrioritizedUrl {
    /// Builds an entry for `url` at `depth`, scoring it with `prioritizer`.
    ///
    /// The score is computed once, here, via
    /// `UrlPrioritizer::calculate_priority`; it is not recomputed later.
    pub fn new(url: String, depth: u32, prioritizer: &UrlPrioritizer) -> Self {
        Self {
            // Evaluated first, while `url` is still only borrowed; the field
            // initialiser below then moves the string into the struct.
            priority: prioritizer.calculate_priority(&url, depth),
            url,
            depth,
        }
    }
}
impl Ord for PrioritizedUrl {
    /// Total order used when entries are placed in a max-heap.
    ///
    /// Comparison keys, in order:
    /// 1. `priority` — a higher score compares as greater.
    /// 2. `depth` — reversed, so on equal score the shallower entry is greater.
    /// 3. `url` — reversed lexicographic order, so on equal score and depth the
    ///    lexicographically smaller URL is greater.
    ///
    /// The final `url` key keeps this ordering consistent with the derived
    /// `Eq`, which compares all three fields: `cmp` returns `Equal` only for
    /// entries that are also `==`. It also makes the order of ties
    /// deterministic instead of depending on insertion order.
    fn cmp(&self, other: &Self) -> Ordering {
        self.priority
            .cmp(&other.priority)
            .then_with(|| other.depth.cmp(&self.depth))
            .then_with(|| other.url.cmp(&self.url))
    }
}
impl PartialOrd for PrioritizedUrl {
    /// Always returns `Some`: the partial order is exactly the total order
    /// defined by the `Ord` implementation for this type.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
/// Scores URLs from a base value adjusted by path shape, depth, regex pattern
/// matches and URL length (see `calculate_priority`).
#[derive(Debug, Clone)]
pub struct UrlPrioritizer {
/// Regexes whose match on a URL earns a score bonus.
high_priority_patterns: Vec<Regex>,
/// Regexes whose match on a URL incurs a score penalty.
low_priority_patterns: Vec<Regex>,
/// Starting score before any adjustment is applied.
base_priority: i32,
/// Amount subtracted from the score per unit of depth.
depth_penalty: i32,
/// URL length (in bytes of the parsed URL's string form) above which a
/// length penalty starts to apply.
url_length_threshold: usize,
}
impl Default for UrlPrioritizer {
fn default() -> Self {
Self::new()
}
}
impl UrlPrioritizer {
pub fn new() -> Self {
let high_priority_patterns = vec![
Regex::new(r"/(index|main|home|about|contact)(\.|$|/)").unwrap(),
Regex::new(r"/(docs?|documentation|guide|tutorial|api|reference)(/|$)").unwrap(),
Regex::new(r"/(product|service|feature|pricing|solution)s?(/|$)").unwrap(),
Regex::new(r"/(blog|article|post|news)/[^/]+/?$").unwrap(),
Regex::new(r"/(category|tag|topic)/[^/]+/?$").unwrap(),
];
let low_priority_patterns = vec![
Regex::new(r"/page/([2-9]|\d{2,})(/|$)").unwrap(),
Regex::new(r"[?&]page=([2-9]|\d{2,})(&|$)").unwrap(),
Regex::new(r"/\d{4}/\d{2}").unwrap(),
Regex::new(r"\.(pdf|zip|tar|gz|rar|exe|dmg|pkg)$").unwrap(),
Regex::new(r"\.(jpg|jpeg|png|gif|svg|mp4|mp3|avi|mov|wav)$").unwrap(),
Regex::new(r"#").unwrap(),
Regex::new(r"\?[^?]{50,}$").unwrap(),
Regex::new(r"/(login|logout|admin|account|dashboard|settings|profile)(/|$)").unwrap(),
Regex::new(r"/(comment|reply|respond)").unwrap(),
Regex::new(r"[?&](search|q|query)=").unwrap(),
];
Self {
high_priority_patterns,
low_priority_patterns,
base_priority: 100,
depth_penalty: 10,
url_length_threshold: 100,
}
}
pub fn calculate_priority(&self, url: &str, depth: u32) -> i32 {
let mut priority = self.base_priority;
let parsed_url = Url::parse(url).ok();
if let Some(ref parsed) = parsed_url {
let path = parsed.path();
if path == "/" || path.is_empty() {
priority += 100; } else if path.matches('/').count() == 1 ||
(path.matches('/').count() == 2 && path.ends_with('/')) {
priority += 40; }
}
priority -= (depth as i32) * self.depth_penalty;
for pattern in &self.high_priority_patterns {
if pattern.is_match(url) {
priority += 50;
break; }
}
for pattern in &self.low_priority_patterns {
if pattern.is_match(url) {
priority -= 50;
break; }
}
if let Some(parsed) = parsed_url {
let url_string = parsed.to_string();
if url_string.len() > self.url_length_threshold {
let excess_length = url_string.len() - self.url_length_threshold;
priority -= (excess_length / 10) as i32;
}
}
priority
}
pub fn with_high_priority_pattern(mut self, pattern: &str) -> Result<Self, regex::Error> {
self.high_priority_patterns.push(Regex::new(pattern)?);
Ok(self)
}
pub fn with_low_priority_pattern(mut self, pattern: &str) -> Result<Self, regex::Error> {
self.low_priority_patterns.push(Regex::new(pattern)?);
Ok(self)
}
pub fn with_base_priority(mut self, priority: i32) -> Self {
self.base_priority = priority;
self
}
pub fn with_depth_penalty(mut self, penalty: i32) -> Self {
self.depth_penalty = penalty;
self
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::BinaryHeap;

    /// Root beats a docs page, which beats a deep pagination page, both by
    /// raw score and through `Ord`.
    #[test]
    fn test_prioritized_url_ordering() {
        let prioritizer = UrlPrioritizer::new();
        let url1 = PrioritizedUrl::new("https://example.com/page/10".to_string(), 2, &prioritizer);
        let url2 =
            PrioritizedUrl::new("https://example.com/docs/guide".to_string(), 1, &prioritizer);
        let url3 = PrioritizedUrl::new("https://example.com/".to_string(), 0, &prioritizer);

        assert!(url3.priority > url2.priority);
        assert!(url2.priority > url1.priority);
        assert_eq!(url3.cmp(&url2), Ordering::Greater);
        assert_eq!(url2.cmp(&url1), Ordering::Greater);
    }

    /// A `BinaryHeap` pops entries from highest to lowest score.
    #[test]
    fn test_binary_heap_ordering() {
        let prioritizer = UrlPrioritizer::new();
        let mut heap = BinaryHeap::new();
        heap.push(PrioritizedUrl::new(
            "https://example.com/page/10".to_string(),
            2,
            &prioritizer,
        ));
        heap.push(PrioritizedUrl::new("https://example.com/".to_string(), 0, &prioritizer));
        heap.push(PrioritizedUrl::new(
            "https://example.com/docs/guide".to_string(),
            1,
            &prioritizer,
        ));
        heap.push(PrioritizedUrl::new("https://example.com/login".to_string(), 2, &prioritizer));

        let first = heap.pop().unwrap();
        assert_eq!(first.url, "https://example.com/");
        let second = heap.pop().unwrap();
        assert_eq!(second.url, "https://example.com/docs/guide");
    }

    /// Five levels of depth cost 5 x the default penalty of 10.
    #[test]
    fn test_depth_penalty() {
        let prioritizer = UrlPrioritizer::new();
        let shallow = prioritizer.calculate_priority("https://example.com/page", 0);
        let deep = prioritizer.calculate_priority("https://example.com/page", 5);

        assert!(shallow > deep);
        assert_eq!(shallow - deep, 50);
    }

    #[test]
    fn test_high_priority_patterns() {
        let prioritizer = UrlPrioritizer::new();
        let base_priority = prioritizer.calculate_priority("https://example.com/random", 0);
        let docs_priority = prioritizer.calculate_priority("https://example.com/docs/api", 0);
        let index_priority = prioritizer.calculate_priority("https://example.com/index.html", 0);

        assert!(docs_priority > base_priority);
        assert!(index_priority > base_priority);
    }

    #[test]
    fn test_low_priority_patterns() {
        let prioritizer = UrlPrioritizer::new();
        let base_priority = prioritizer.calculate_priority("https://example.com/article", 0);
        let pdf_priority = prioritizer.calculate_priority("https://example.com/doc.pdf", 0);
        let page10_priority = prioritizer.calculate_priority("https://example.com/page/10", 0);
        let login_priority = prioritizer.calculate_priority("https://example.com/login", 0);

        assert!(pdf_priority < base_priority);
        assert!(page10_priority < base_priority);
        assert!(login_priority < base_priority);
    }

    #[test]
    fn test_url_length_penalty() {
        let prioritizer = UrlPrioritizer::new();
        let short_url = "https://example.com/page";
        let long_url = "https://example.com/very/long/path/with/many/segments/and/parameters?query=value&filter=enabled&sort=desc";

        let short_priority = prioritizer.calculate_priority(short_url, 0);
        let long_priority = prioritizer.calculate_priority(long_url, 0);

        assert!(short_priority > long_priority);
    }

    #[test]
    fn test_root_path_boost() {
        let prioritizer = UrlPrioritizer::new();
        let root_priority = prioritizer.calculate_priority("https://example.com/", 0);
        let subpage_priority = prioritizer.calculate_priority("https://example.com/about", 0);
        let deep_priority =
            prioritizer.calculate_priority("https://example.com/blog/post/123", 0);

        assert!(root_priority > subpage_priority);
        assert!(subpage_priority > deep_priority);
    }

    #[test]
    fn test_custom_patterns() {
        let prioritizer = UrlPrioritizer::new()
            .with_high_priority_pattern(r"/important")
            .unwrap()
            .with_low_priority_pattern(r"/ignore")
            .unwrap();

        let important_priority =
            prioritizer.calculate_priority("https://example.com/important/page", 0);
        let ignore_priority = prioritizer.calculate_priority("https://example.com/ignore/page", 0);
        let normal_priority = prioritizer.calculate_priority("https://example.com/normal/page", 0);

        assert!(important_priority > normal_priority);
        assert!(ignore_priority < normal_priority);
    }

    /// Page 2 and page 10 both hit the pagination pattern and score the same.
    #[test]
    fn test_pagination_detection() {
        let prioritizer = UrlPrioritizer::new();
        let page1_priority = prioritizer.calculate_priority("https://example.com/blog", 0);
        let page2_priority = prioritizer.calculate_priority("https://example.com/blog/page/2", 0);
        let page10_priority =
            prioritizer.calculate_priority("https://example.com/blog/page/10", 0);

        assert!(page1_priority > page2_priority);
        assert!(page1_priority > page10_priority);
        assert_eq!(page2_priority, page10_priority);
    }

    /// With equal scores, the shallower entry must compare as greater and be
    /// popped from a heap first.
    #[test]
    fn test_equal_priority_depth_tiebreaker() {
        // Built directly rather than through `new`: the depth penalty would
        // otherwise give the two entries different scores and the tiebreaker
        // would never be reached.
        let shallow = PrioritizedUrl {
            url: "https://example.com/random1".to_string(),
            priority: 100,
            depth: 1,
        };
        let deep = PrioritizedUrl {
            url: "https://example.com/random2".to_string(),
            priority: 100,
            depth: 3,
        };

        assert_eq!(shallow.priority, deep.priority);
        assert!(shallow.depth < deep.depth);
        assert_eq!(shallow.cmp(&deep), Ordering::Greater);
        assert_eq!(deep.cmp(&shallow), Ordering::Less);

        let mut heap = BinaryHeap::new();
        heap.push(deep.clone());
        heap.push(shallow.clone());
        assert_eq!(heap.pop().unwrap(), shallow);
        assert_eq!(heap.pop().unwrap(), deep);
    }
}