// halldyll_core/crawl/frontier.rs
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::RwLock;
use url::Url;

use crate::types::config::CrawlStrategy;
/// A single URL queued for crawling, ordered in the frontier's heap by
/// `priority` (see the `Ord` impl for `CrawlEntry`).
#[derive(Debug, Clone)]
pub struct CrawlEntry {
    /// The URL to fetch.
    pub url: Url,
    /// Link distance from the seed (presumably seeds are depth 0 —
    /// TODO confirm against the code that creates seed entries).
    pub depth: u32,
    /// Scheduling priority; higher values are popped first.
    pub priority: i32,
    /// The page on which this URL was discovered, if any.
    pub parent_url: Option<Url>,
    /// When this entry was constructed (set by `CrawlEntry::new`).
    pub added_at: std::time::Instant,
}
24
25impl CrawlEntry {
26 pub fn new(url: Url, depth: u32, priority: i32) -> Self {
28 Self {
29 url,
30 depth,
31 priority,
32 parent_url: None,
33 added_at: std::time::Instant::now(),
34 }
35 }
36
37 pub fn with_parent(mut self, parent: Url) -> Self {
39 self.parent_url = Some(parent);
40 self
41 }
42}
43
44impl Eq for CrawlEntry {}
46
47impl PartialEq for CrawlEntry {
48 fn eq(&self, other: &Self) -> bool {
49 self.url == other.url
50 }
51}
52
53impl PartialOrd for CrawlEntry {
54 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
55 Some(self.cmp(other))
56 }
57}
58
59impl Ord for CrawlEntry {
60 fn cmp(&self, other: &Self) -> Ordering {
61 self.priority
63 .cmp(&other.priority)
64 .then_with(|| other.depth.cmp(&self.depth)) }
66}
67
/// Thread-safe crawl frontier: a priority queue of pending URLs plus
/// dedup and limit bookkeeping. Interior mutability via `RwLock` lets the
/// frontier be shared behind `&self` across worker threads.
#[allow(dead_code)]
pub struct Frontier {
    /// Max-heap of pending entries (ordered by `CrawlEntry`'s `Ord` impl).
    queue: RwLock<BinaryHeap<CrawlEntry>>,
    /// Serialized URLs ever accepted or marked seen (dedup set).
    seen: RwLock<HashSet<String>>,
    /// Accepted-URL count per host string.
    domain_counts: RwLock<HashMap<String, u32>>,
    /// Crawl ordering strategy; stored but never read in this file —
    /// presumably consumed by a scheduler elsewhere (hence `dead_code`).
    strategy: CrawlStrategy,
    /// Entries deeper than this are rejected by `push`.
    max_depth: u32,
    /// Per-host cap on accepted URLs.
    max_per_domain: u32,
    /// Global cap on accepted (seen) URLs.
    max_total: u32,
}
86
87impl Frontier {
88 pub fn new(strategy: CrawlStrategy, max_depth: u32, max_per_domain: u32, max_total: u32) -> Self {
90 Self {
91 queue: RwLock::new(BinaryHeap::new()),
92 seen: RwLock::new(HashSet::new()),
93 domain_counts: RwLock::new(HashMap::new()),
94 strategy,
95 max_depth,
96 max_per_domain,
97 max_total,
98 }
99 }
100
101 pub fn push(&self, entry: CrawlEntry) -> bool {
103 if entry.depth > self.max_depth {
105 return false;
106 }
107
108 let url_key = entry.url.to_string();
109 let domain = entry.url.host_str().unwrap_or("").to_string();
110
111 {
113 let seen = self.seen.read().unwrap();
114 if seen.contains(&url_key) {
115 return false;
116 }
117 }
118
119 {
121 let counts = self.domain_counts.read().unwrap();
122 if let Some(&count) = counts.get(&domain) {
123 if count >= self.max_per_domain {
124 return false;
125 }
126 }
127 }
128
129 {
131 let seen = self.seen.read().unwrap();
132 if seen.len() >= self.max_total as usize {
133 return false;
134 }
135 }
136
137 {
139 let mut seen = self.seen.write().unwrap();
140 let mut queue = self.queue.write().unwrap();
141 let mut counts = self.domain_counts.write().unwrap();
142
143 seen.insert(url_key);
144 queue.push(entry);
145 *counts.entry(domain).or_insert(0) += 1;
146 }
147
148 true
149 }
150
151 pub fn push_many(&self, entries: Vec<CrawlEntry>) -> usize {
153 entries.into_iter().filter(|e| self.push(e.clone())).count()
154 }
155
156 pub fn pop(&self) -> Option<CrawlEntry> {
158 let mut queue = self.queue.write().unwrap();
159 queue.pop()
160 }
161
162 pub fn len(&self) -> usize {
164 self.queue.read().unwrap().len()
165 }
166
167 pub fn is_empty(&self) -> bool {
169 self.queue.read().unwrap().is_empty()
170 }
171
172 pub fn seen_count(&self) -> usize {
174 self.seen.read().unwrap().len()
175 }
176
177 pub fn has_seen(&self, url: &Url) -> bool {
179 self.seen.read().unwrap().contains(&url.to_string())
180 }
181
182 pub fn mark_seen(&self, url: &Url) {
184 self.seen.write().unwrap().insert(url.to_string());
185 }
186
187 pub fn clear(&self) {
189 let mut queue = self.queue.write().unwrap();
190 let mut seen = self.seen.write().unwrap();
191 let mut counts = self.domain_counts.write().unwrap();
192 queue.clear();
193 seen.clear();
194 counts.clear();
195 }
196
197 pub fn domain_stats(&self) -> HashMap<String, u32> {
199 self.domain_counts.read().unwrap().clone()
200 }
201}