wdict/collections/
urldb.rs

1use std::collections::HashMap;
2use std::sync::{Arc, Mutex, MutexGuard};
3
4/// Stores urls, tracking whether or not they have been visited.
5#[derive(Debug)]
6pub struct UrlDb(Arc<Mutex<HashMap<String, Status>>>);
7
8impl Clone for UrlDb {
9    /// Returns a clone/handle of the given UrlDb.
10    fn clone(&self) -> Self {
11        UrlDb(Arc::clone(&self.0))
12    }
13}
14
15impl UrlDb {
16    /// Returns a new UrlDb instance.
17    pub fn new() -> Self {
18        UrlDb(Arc::new(Mutex::new(HashMap::new())))
19    }
20
21    /// Returns an iterator over the urls that were discovered and visited.
22    pub fn visited_urls_iter(&self) -> impl Iterator<Item = String> {
23        let hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
24            Ok(guard) => guard,
25            Err(poisoned) => poisoned.into_inner(),
26        };
27        hm.clone()
28            .into_iter()
29            .filter(|(_k, v)| *v == Status::Visited)
30            .map(|(k, _v)| k)
31    }
32
33    /// Returns an iterator over the urls that are currently staged.
34    pub fn staged_urls_iter(&self) -> impl Iterator<Item = String> {
35        let hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
36            Ok(guard) => guard,
37            Err(poisoned) => poisoned.into_inner(),
38        };
39        hm.clone()
40            .into_iter()
41            .filter(|(_k, v)| *v == Status::Staged)
42            .map(|(k, _v)| k)
43    }
44
45    /// Returns an iterator over the urls that were discovered, but unvisited.
46    pub fn unvisited_urls_iter(&self) -> impl Iterator<Item = String> {
47        let hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
48            Ok(guard) => guard,
49            Err(poisoned) => poisoned.into_inner(),
50        };
51        hm.clone()
52            .into_iter()
53            .filter(|(_k, v)| *v == Status::Unvisited)
54            .map(|(k, _v)| k)
55    }
56
57    /// Returns an iterator over the urls that were discovered, but skipped.
58    pub fn skipped_urls_iter(&self) -> impl Iterator<Item = String> {
59        let hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
60            Ok(guard) => guard,
61            Err(poisoned) => poisoned.into_inner(),
62        };
63        hm.clone()
64            .into_iter()
65            .filter(|(_k, v)| *v == Status::Skip)
66            .map(|(k, _v)| k)
67    }
68
69    /// Returns an iterator over the urls that were discovered, but encountered and error while
70    /// visiting.
71    pub fn errored_urls_iter(&self) -> impl Iterator<Item = String> {
72        let hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
73            Ok(guard) => guard,
74            Err(poisoned) => poisoned.into_inner(),
75        };
76        hm.clone()
77            .into_iter()
78            .filter(|(_k, v)| *v == Status::Error)
79            .map(|(k, _v)| k)
80    }
81
82    /// Returns the number of urls that were visited.
83    pub fn num_visited_urls(&self) -> usize {
84        self.visited_urls_iter().count()
85    }
86
87    /// Returns the number of urls that were staged.
88    pub fn num_staged_urls(&self) -> usize {
89        self.staged_urls_iter().count()
90    }
91
92    /// Returns the number of urls that were unvisited.
93    pub fn num_unvisited_urls(&self) -> usize {
94        self.unvisited_urls_iter().count()
95    }
96
97    /// Returns the number of urls that were skipped.
98    pub fn num_skipped_urls(&self) -> usize {
99        self.skipped_urls_iter().count()
100    }
101
102    /// Returns the number of urls that encountered an error.
103    pub fn num_errored_urls(&self) -> usize {
104        self.errored_urls_iter().count()
105    }
106
107    /// Inserts and marks a url as visited.
108    pub fn mark_visited(&mut self, url: &str) -> () {
109        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
110            Ok(guard) => guard,
111            Err(poisoned) => poisoned.into_inner(),
112        };
113        hm.insert(url.to_owned(), Status::Visited);
114    }
115
116    /// Inserts and marks a url as staged.
117    pub fn mark_staged(&mut self, url: &str) -> () {
118        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
119            Ok(guard) => guard,
120            Err(poisoned) => poisoned.into_inner(),
121        };
122        hm.insert(url.to_owned(), Status::Staged);
123    }
124
125    /// Inserts and marks a url as unvisited.
126    pub fn mark_unvisited(&mut self, url: &str) -> () {
127        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
128            Ok(guard) => guard,
129            Err(poisoned) => poisoned.into_inner(),
130        };
131        hm.insert(url.to_owned(), Status::Unvisited);
132    }
133
134    /// Inserts and marks a url as skipped.
135    pub fn mark_skipped(&mut self, url: &str) -> () {
136        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
137            Ok(guard) => guard,
138            Err(poisoned) => poisoned.into_inner(),
139        };
140        hm.insert(url.to_owned(), Status::Skip);
141    }
142
143    /// Inserts and marks a url as errored.
144    pub fn mark_errored(&mut self, url: &str) -> () {
145        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
146            Ok(guard) => guard,
147            Err(poisoned) => poisoned.into_inner(),
148        };
149        hm.insert(url.to_owned(), Status::Error);
150    }
151
152    /// Inserts and marks a url as visited, only if the url is new.
153    pub fn cond_mark_visited(&mut self, url: &str) -> () {
154        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
155            Ok(guard) => guard,
156            Err(poisoned) => poisoned.into_inner(),
157        };
158        hm.entry(url.to_owned()).or_insert(Status::Visited);
159    }
160
161    /// Inserts and marks a url as staged, only if the url is new.
162    pub fn cond_mark_staged(&mut self, url: &str) -> () {
163        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
164            Ok(guard) => guard,
165            Err(poisoned) => poisoned.into_inner(),
166        };
167        hm.entry(url.to_owned()).or_insert(Status::Staged);
168    }
169
170    /// Inserts and marks a url as unvisited, only if the url is new.
171    pub fn cond_mark_unvisited(&mut self, url: &str) -> () {
172        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
173            Ok(guard) => guard,
174            Err(poisoned) => poisoned.into_inner(),
175        };
176        hm.entry(url.to_owned()).or_insert(Status::Unvisited);
177    }
178
179    /// Inserts and marks a url as skipped, only if the url is new.
180    pub fn cond_mark_skipped(&mut self, url: &str) -> () {
181        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
182            Ok(guard) => guard,
183            Err(poisoned) => poisoned.into_inner(),
184        };
185        hm.entry(url.to_owned()).or_insert(Status::Skip);
186    }
187
188    /// Inserts and marks a url as errored, only if the url is new.
189    pub fn cond_mark_errored(&mut self, url: &str) -> () {
190        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
191            Ok(guard) => guard,
192            Err(poisoned) => poisoned.into_inner(),
193        };
194        hm.entry(url.to_owned()).or_insert(Status::Error);
195    }
196
197    /// Move all unvisited urls onto the stage.
198    pub fn stage_unvisited_urls(&mut self) {
199        let mut hm: MutexGuard<HashMap<String, Status>> = match self.0.lock() {
200            Ok(guard) => guard,
201            Err(poisoned) => poisoned.into_inner(),
202        };
203        for (k, _v) in hm
204            .clone()
205            .into_iter()
206            .filter(|(_k, v)| *v == Status::Unvisited)
207        {
208            hm.insert(k, Status::Staged);
209        }
210    }
211}
212
/// The state recorded for a single url in the database.
///
/// Equality is derived: two statuses are equal exactly when they are the same variant, and any
/// variant added later is covered automatically.
#[derive(Copy, Debug, Clone, PartialEq, Eq)]
enum Status {
    /// A Url that was already visited successfully.
    Visited,
    /// A Url that has not yet been visited, but is about to be.
    Staged,
    /// A newly discovered Url that has not yet been processed or visited.
    Unvisited,
    /// A Url that has been determined to be skipped; ultimately will not be visited.
    Skip,
    /// A Url of which an error was encountered while attempting to visit.
    Error,
}