halldyll_core/crawl/canonical.rs

use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;

/// Tracks `rel="canonical"` mappings discovered during a crawl and resolves
/// URLs to their preferred canonical form. Safe to share across threads via
/// the internal `RwLock`.
pub struct CanonicalResolver {
    /// Known URL-to-canonical mappings, keyed by the URL's serialized string form.
    canonical_map: RwLock<HashMap<String, Url>>,
    /// When `false`, canonical handling is disabled and URLs pass through unchanged.
    respect_canonicals: bool,
}

impl Default for CanonicalResolver {
    fn default() -> Self {
        Self::new(true)
    }
}

impl CanonicalResolver {
    /// Creates a resolver. When `respect_canonicals` is `false`, every
    /// operation becomes a no-op and URLs resolve to themselves.
    pub fn new(respect_canonicals: bool) -> Self {
        Self {
            canonical_map: RwLock::new(HashMap::new()),
            respect_canonicals,
        }
    }

    /// Records that `url` declares `canonical` as its canonical form.
    pub fn register(&self, url: &Url, canonical: &Url) {
        if !self.respect_canonicals {
            return;
        }

        let key = url.to_string();
        self.canonical_map.write().unwrap().insert(key, canonical.clone());
    }

    /// Returns the canonical URL registered for `url`, or a clone of `url`
    /// itself when none is known (or canonical handling is disabled).
    pub fn resolve(&self, url: &Url) -> Url {
        if !self.respect_canonicals {
            return url.clone();
        }

        let key = url.to_string();
        self.canonical_map
            .read()
            .unwrap()
            .get(&key)
            .cloned()
            .unwrap_or_else(|| url.clone())
    }

    /// Returns `true` if `url` has a registered canonical that differs from
    /// `url` itself.
    pub fn has_different_canonical(&self, url: &Url) -> bool {
        let key = url.to_string();
        if let Some(canonical) = self.canonical_map.read().unwrap().get(&key) {
            canonical != url
        } else {
            false
        }
    }

    /// Returns the number of tracked URL-to-canonical mappings.
    pub fn count(&self) -> usize {
        self.canonical_map.read().unwrap().len()
    }

    /// Removes all tracked mappings.
    pub fn clear(&self) {
        self.canonical_map.write().unwrap().clear();
    }

    /// Extracts the canonical link from `html`, returning it only when it
    /// differs from `base_url`. Returns `None` when canonical handling is
    /// disabled, no canonical is declared, or the canonical is self-referential.
    pub fn resolve_from_html(&self, html: &str, base_url: &Url) -> Option<Url> {
        if !self.respect_canonicals {
            return None;
        }

        extract_canonical_from_html(html, base_url).filter(|canonical| canonical != base_url)
    }
}
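
// Illustrative sketch, not part of the original file: one way a crawler
// might combine `resolve_from_html` and `register` after fetching a page,
// returning the URL it should treat as authoritative. The function name is
// hypothetical.
#[allow(dead_code)]
fn example_apply_page_canonical(resolver: &CanonicalResolver, url: &Url, body: &str) -> Url {
    if let Some(canonical) = resolver.resolve_from_html(body, url) {
        // The page declared a different canonical: remember and prefer it.
        resolver.register(url, &canonical);
        canonical
    } else {
        // Fall back to any previously registered mapping (or `url` itself).
        resolver.resolve(url)
    }
}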

/// Parses `html` and returns the target of the first `<link rel="canonical">`
/// element, resolved against `base_url`.
pub fn extract_canonical_from_html(html: &str, base_url: &Url) -> Option<Url> {
    let document = scraper::Html::parse_document(html);
    let selector = scraper::Selector::parse(r#"link[rel="canonical"]"#).ok()?;

    document
        .select(&selector)
        .next()
        .and_then(|el| el.value().attr("href"))
        .and_then(|href| base_url.join(href).ok())
}

/// Pagination links (`rel="next"` / `rel="prev"`) discovered in a page's markup.
pub struct PaginationLinks {
    /// Target of `<link rel="next">`, if present.
    pub next: Option<Url>,
    /// Target of `<link rel="prev">`, if present.
    pub prev: Option<Url>,
}

/// Extracts `<link rel="next">` and `<link rel="prev">` pagination targets
/// from `html`, each resolved against `base_url`.
pub fn extract_pagination_from_html(html: &str, base_url: &Url) -> PaginationLinks {
    let document = scraper::Html::parse_document(html);

    let next = scraper::Selector::parse(r#"link[rel="next"]"#)
        .ok()
        .and_then(|sel| {
            document
                .select(&sel)
                .next()
                .and_then(|el| el.value().attr("href"))
                .and_then(|href| base_url.join(href).ok())
        });

    let prev = scraper::Selector::parse(r#"link[rel="prev"]"#)
        .ok()
        .and_then(|sel| {
            document
                .select(&sel)
                .next()
                .and_then(|el| el.value().attr("href"))
                .and_then(|href| base_url.join(href).ok())
        });

    PaginationLinks { next, prev }
}

/// Collects `<link rel="alternate" hreflang="...">` entries into a map from
/// language tag to resolved URL.
pub fn extract_hreflang_from_html(html: &str, base_url: &Url) -> HashMap<String, Url> {
    let document = scraper::Html::parse_document(html);
    let selector = match scraper::Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
        Ok(s) => s,
        Err(_) => return HashMap::new(),
    };

    document
        .select(&selector)
        .filter_map(|el| {
            let lang = el.value().attr("hreflang")?;
            let href = el.value().attr("href")?;
            let url = base_url.join(href).ok()?;
            Some((lang.to_string(), url))
        })
        .collect()
}
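
// Minimal test sketch, not part of the original file: exercises the public
// API above with made-up example URLs and inline HTML.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolver_maps_registered_urls_and_passes_through_unknown_ones() {
        let resolver = CanonicalResolver::new(true);
        let page = Url::parse("https://example.com/post?utm_source=x").unwrap();
        let canonical = Url::parse("https://example.com/post").unwrap();

        resolver.register(&page, &canonical);
        assert_eq!(resolver.resolve(&page), canonical);
        assert!(resolver.has_different_canonical(&page));
        assert_eq!(resolver.count(), 1);

        // An unregistered URL resolves to itself.
        let other = Url::parse("https://example.com/other").unwrap();
        assert_eq!(resolver.resolve(&other), other);

        resolver.clear();
        assert_eq!(resolver.count(), 0);
    }

    #[test]
    fn html_extraction_finds_canonical_pagination_and_hreflang() {
        let base = Url::parse("https://example.com/blog?page=2").unwrap();
        let html = r#"<html><head>
            <link rel="canonical" href="/blog">
            <link rel="next" href="/blog?page=3">
            <link rel="prev" href="/blog?page=1">
            <link rel="alternate" hreflang="de" href="/de/blog">
        </head><body></body></html>"#;

        let canonical = extract_canonical_from_html(html, &base).unwrap();
        assert_eq!(canonical.as_str(), "https://example.com/blog");

        let links = extract_pagination_from_html(html, &base);
        assert_eq!(links.next.unwrap().as_str(), "https://example.com/blog?page=3");
        assert_eq!(links.prev.unwrap().as_str(), "https://example.com/blog?page=1");

        let alternates = extract_hreflang_from_html(html, &base);
        assert_eq!(alternates["de"].as_str(), "https://example.com/de/blog");
    }
}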