halldyll_core/crawl/
frontier.rs

1//! Frontier - Priority queue for crawling
2
3use std::cmp::Ordering;
4use std::collections::{BinaryHeap, HashMap, HashSet};
5use std::sync::RwLock;
6use url::Url;
7
8use crate::types::config::CrawlStrategy;
9
/// Entry in the frontier.
///
/// Ordered by `priority` (then depth) for the frontier's max-heap; see the
/// `Ord` impl below.
#[derive(Debug, Clone)]
pub struct CrawlEntry {
    /// URL to crawl
    pub url: Url,
    /// Crawl depth
    pub depth: u32,
    /// Priority score (higher = more priority)
    pub priority: i32,
    /// Parent URL (where this link came from); `None` for seed entries
    pub parent_url: Option<Url>,
    /// Timestamp when added (set by `CrawlEntry::new`)
    pub added_at: std::time::Instant,
}
24
25impl CrawlEntry {
26    /// New entry
27    pub fn new(url: Url, depth: u32, priority: i32) -> Self {
28        Self {
29            url,
30            depth,
31            priority,
32            parent_url: None,
33            added_at: std::time::Instant::now(),
34        }
35    }
36
37    /// With parent URL
38    pub fn with_parent(mut self, parent: Url) -> Self {
39        self.parent_url = Some(parent);
40        self
41    }
42}
43
44// Ordonnancement pour BinaryHeap (max-heap)
45impl Eq for CrawlEntry {}
46
47impl PartialEq for CrawlEntry {
48    fn eq(&self, other: &Self) -> bool {
49        self.url == other.url
50    }
51}
52
53impl PartialOrd for CrawlEntry {
54    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
55        Some(self.cmp(other))
56    }
57}
58
59impl Ord for CrawlEntry {
60    fn cmp(&self, other: &Self) -> Ordering {
61        // Priority first, then depth (reversed for BFS)
62        self.priority
63            .cmp(&other.priority)
64            .then_with(|| other.depth.cmp(&self.depth)) // BFS: lower depth first
65    }
66}
67
/// Thread-safe frontier.
///
/// Interior mutability via `RwLock` lets a shared `&Frontier` be used from
/// multiple threads without external locking.
#[allow(dead_code)]
pub struct Frontier {
    /// Priority queue (max-heap: highest-priority `CrawlEntry` pops first)
    queue: RwLock<BinaryHeap<CrawlEntry>>,
    /// Already seen URLs, stored as full URL strings
    seen: RwLock<HashSet<String>>,
    /// Counter of accepted URLs per domain, keyed by host string
    domain_counts: RwLock<HashMap<String, u32>>,
    /// Crawl strategy (stored but not consulted by the methods in this file)
    strategy: CrawlStrategy,
    /// Max depth accepted by `push`
    max_depth: u32,
    /// Max URLs accepted per domain
    max_per_domain: u32,
    /// Max total URLs (enforced against the `seen` set size)
    max_total: u32,
}
86
87impl Frontier {
88    /// New frontier
89    pub fn new(strategy: CrawlStrategy, max_depth: u32, max_per_domain: u32, max_total: u32) -> Self {
90        Self {
91            queue: RwLock::new(BinaryHeap::new()),
92            seen: RwLock::new(HashSet::new()),
93            domain_counts: RwLock::new(HashMap::new()),
94            strategy,
95            max_depth,
96            max_per_domain,
97            max_total,
98        }
99    }
100
101    /// Add a URL to the frontier
102    pub fn push(&self, entry: CrawlEntry) -> bool {
103        // Check depth
104        if entry.depth > self.max_depth {
105            return false;
106        }
107
108        let url_key = entry.url.to_string();
109        let domain = entry.url.host_str().unwrap_or("").to_string();
110
111        // Check if already seen
112        {
113            let seen = self.seen.read().unwrap();
114            if seen.contains(&url_key) {
115                return false;
116            }
117        }
118
119        // Check domain quota
120        {
121            let counts = self.domain_counts.read().unwrap();
122            if let Some(&count) = counts.get(&domain) {
123                if count >= self.max_per_domain {
124                    return false;
125                }
126            }
127        }
128
129        // Check total quota
130        {
131            let seen = self.seen.read().unwrap();
132            if seen.len() >= self.max_total as usize {
133                return false;
134            }
135        }
136
137        // Add
138        {
139            let mut seen = self.seen.write().unwrap();
140            let mut queue = self.queue.write().unwrap();
141            let mut counts = self.domain_counts.write().unwrap();
142
143            seen.insert(url_key);
144            queue.push(entry);
145            *counts.entry(domain).or_insert(0) += 1;
146        }
147
148        true
149    }
150
151    /// Add multiple URLs
152    pub fn push_many(&self, entries: Vec<CrawlEntry>) -> usize {
153        entries.into_iter().filter(|e| self.push(e.clone())).count()
154    }
155
156    /// Get the next URL
157    pub fn pop(&self) -> Option<CrawlEntry> {
158        let mut queue = self.queue.write().unwrap();
159        queue.pop()
160    }
161
162    /// Queue size
163    pub fn len(&self) -> usize {
164        self.queue.read().unwrap().len()
165    }
166
167    /// Is queue empty?
168    pub fn is_empty(&self) -> bool {
169        self.queue.read().unwrap().is_empty()
170    }
171
172    /// Number of seen URLs
173    pub fn seen_count(&self) -> usize {
174        self.seen.read().unwrap().len()
175    }
176
177    /// URL already seen?
178    pub fn has_seen(&self, url: &Url) -> bool {
179        self.seen.read().unwrap().contains(&url.to_string())
180    }
181
182    /// Mark a URL as seen without adding it to the queue
183    pub fn mark_seen(&self, url: &Url) {
184        self.seen.write().unwrap().insert(url.to_string());
185    }
186
187    /// Clear the frontier
188    pub fn clear(&self) {
189        let mut queue = self.queue.write().unwrap();
190        let mut seen = self.seen.write().unwrap();
191        let mut counts = self.domain_counts.write().unwrap();
192        queue.clear();
193        seen.clear();
194        counts.clear();
195    }
196
197    /// Stats per domain
198    pub fn domain_stats(&self) -> HashMap<String, u32> {
199        self.domain_counts.read().unwrap().clone()
200    }
201}