1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
use std::collections::{BTreeMap, BTreeSet};
use log::{debug, info};
use reqwest::{Response, Url};
use serde::{ser::SerializeStruct, Serialize};
/// Bookkeeping for URLs seen by the scraper, keyed by small integer ids.
///
/// Ids are handed out by [`Record::check_add_url`]; the remaining fields refer
/// to URLs by those ids.
#[derive(Debug, Default)]
pub struct Record {
    /// Every known URL mapped to the id assigned to it.
    pub urls: BTreeMap<Url, usize>,
    /// Reverse lookup of `urls`: id back to the URL it was assigned to.
    pub url_ids: BTreeMap<usize, Url>,
    /// Ids of final URLs recorded by [`Record::check_final_url`].
    pub scrapes: BTreeSet<usize>,
    /// Ids recorded as failures.
    // NOTE(review): nothing in this part of the file writes to `fails`;
    // presumably callers insert the ids of URLs that could not be fetched.
    pub fails: BTreeSet<usize>,
    /// Redirects observed so far: id of the requested URL mapped to the id of
    /// the URL the response finally came from.
    pub redirects: BTreeMap<usize, usize>,
}
impl Record {
    /// Registers `url` and reports the id associated with it.
    ///
    /// Returns `Ok(id)` with a newly assigned id when the URL was not known
    /// yet, and `Err(id)` carrying the existing id when it was already
    /// registered. A new id is the number of URLs known before the insert, and
    /// it is stored in both `urls` and `url_ids`.
    pub fn check_add_url(&mut self, url: Url) -> Result<usize, usize> {
        if let Some(&index) = self.urls.get(&url) {
            return Err(index);
        }
        let index = self.urls.len();
        self.urls.insert(url.clone(), index);
        self.url_ids.insert(index, url);
        Ok(index)
    }

    /// Records the URL that `response` was finally served from.
    ///
    /// `url_id` is the id of the URL that was requested. The response's URL is
    /// registered via [`Record::check_add_url`] (if it is new) and then:
    ///
    /// * if it differs from `url_id` and has already been scraped, nothing is
    ///   recorded and `None` is returned;
    /// * otherwise its id is added to `scrapes`, a `redirects` entry
    ///   `url_id -> final id` is stored when the two ids differ, and
    ///   `Some(final id)` is returned.
    ///
    /// The function is declared `async` but awaits nothing in its body; the
    /// signature is kept as-is for existing callers.
    pub async fn check_final_url(
        &mut self,
        url_id: usize,
        response: &Response,
    ) -> Option<usize> {
        let final_url_id = match self.check_add_url(response.url().to_owned()) {
            Ok(id) => id,
            Err(id) => {
                // Only a *different*, already-scraped URL short-circuits; when
                // the ids are equal the request was not redirected and is
                // recorded below.
                if url_id != id && self.scrapes.contains(&id) {
                    debug!("{url_id}: already scraped as {id}.");
                    return None;
                }
                id
            }
        };
        if url_id != final_url_id {
            // Log the redirect target, not the source id twice.
            info!("{url_id} redirected to {final_url_id}.");
            self.redirects.insert(url_id, final_url_id);
        }
        self.scrapes.insert(final_url_id);
        Some(final_url_id)
    }
}
impl Serialize for Record {
    /// Serializes the record as a struct named `"record"` with four fields,
    /// emitted in this order: `scrapes`, `fails`, `urls`, `redirects`.
    ///
    /// `urls` is written as a map from the URL's string form to its id, and
    /// `redirects` as a map from the stringified source id to the target id.
    /// `url_ids` is not serialized.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Re-key both maps by `String`. Because the keys are strings, the
        // resulting maps are ordered lexicographically by that text.
        let mut ids_by_url_text = BTreeMap::new();
        for (url, id) in &self.urls {
            ids_by_url_text.insert(url.to_string(), id);
        }

        let mut targets_by_source_text = BTreeMap::new();
        for (before, after) in &self.redirects {
            targets_by_source_text.insert(before.to_string(), after);
        }

        let mut state = serializer.serialize_struct("record", 4)?;
        state.serialize_field("scrapes", &self.scrapes)?;
        state.serialize_field("fails", &self.fails)?;
        state.serialize_field("urls", &ids_by_url_text)?;
        state.serialize_field("redirects", &targets_by_source_text)?;
        state.end()
    }
}