halldyll_core/crawl/
canonical.rs

1//! Canonical - Canonical URL management
2
3use std::collections::HashMap;
4use std::sync::RwLock;
5use url::Url;
6
7/// Canonical resolver
8pub struct CanonicalResolver {
9    /// Map URL -> canonical URL
10    canonical_map: RwLock<HashMap<String, Url>>,
11    /// Respect canonicals?
12    respect_canonicals: bool,
13}
14
15impl Default for CanonicalResolver {
16    fn default() -> Self {
17        Self::new(true)
18    }
19}
20
21impl CanonicalResolver {
22    /// New resolver
23    pub fn new(respect_canonicals: bool) -> Self {
24        Self {
25            canonical_map: RwLock::new(HashMap::new()),
26            respect_canonicals,
27        }
28    }
29
30    /// Register a URL -> canonical mapping
31    pub fn register(&self, url: &Url, canonical: &Url) {
32        if !self.respect_canonicals {
33            return;
34        }
35        
36        let key = url.to_string();
37        self.canonical_map.write().unwrap().insert(key, canonical.clone());
38    }
39
40    /// Resolve the canonical URL
41    pub fn resolve(&self, url: &Url) -> Url {
42        if !self.respect_canonicals {
43            return url.clone();
44        }
45
46        let key = url.to_string();
47        self.canonical_map
48            .read()
49            .unwrap()
50            .get(&key)
51            .cloned()
52            .unwrap_or_else(|| url.clone())
53    }
54
55    /// Does the URL have a different canonical?
56    pub fn has_different_canonical(&self, url: &Url) -> bool {
57        let key = url.to_string();
58        if let Some(canonical) = self.canonical_map.read().unwrap().get(&key) {
59            canonical != url
60        } else {
61            false
62        }
63    }
64
65    /// Number of mappings
66    pub fn count(&self) -> usize {
67        self.canonical_map.read().unwrap().len()
68    }
69
70    /// Clear the cache
71    pub fn clear(&self) {
72        self.canonical_map.write().unwrap().clear();
73    }
74
75    /// Resolve canonical from HTML content (returns None if same as base_url)
76    pub fn resolve_from_html(&self, html: &str, base_url: &Url) -> Option<Url> {
77        if !self.respect_canonicals {
78            return None;
79        }
80
81        extract_canonical_from_html(html, base_url).filter(|canonical| canonical != base_url)
82    }
83}
84
85/// Extract the canonical link from an HTML document
86pub fn extract_canonical_from_html(html: &str, base_url: &Url) -> Option<Url> {
87    // Parse HTML to find <link rel="canonical">
88    let document = scraper::Html::parse_document(html);
89    let selector = scraper::Selector::parse(r#"link[rel="canonical"]"#).ok()?;
90    
91    document
92        .select(&selector)
93        .next()
94        .and_then(|el| el.value().attr("href"))
95        .and_then(|href| base_url.join(href).ok())
96}
97
98/// Extract pagination links (rel=next/prev)
99pub struct PaginationLinks {
100    /// Next page URL from rel="next"
101    pub next: Option<Url>,
102    /// Previous page URL from rel="prev"
103    pub prev: Option<Url>,
104}
105
106/// Extract pagination links
107pub fn extract_pagination_from_html(html: &str, base_url: &Url) -> PaginationLinks {
108    let document = scraper::Html::parse_document(html);
109    
110    let next = scraper::Selector::parse(r#"link[rel="next"]"#)
111        .ok()
112        .and_then(|sel| {
113            document
114                .select(&sel)
115                .next()
116                .and_then(|el| el.value().attr("href"))
117                .and_then(|href| base_url.join(href).ok())
118        });
119
120    let prev = scraper::Selector::parse(r#"link[rel="prev"]"#)
121        .ok()
122        .and_then(|sel| {
123            document
124                .select(&sel)
125                .next()
126                .and_then(|el| el.value().attr("href"))
127                .and_then(|href| base_url.join(href).ok())
128        });
129
130    PaginationLinks { next, prev }
131}
132
133/// Extract hreflang links
134pub fn extract_hreflang_from_html(html: &str, base_url: &Url) -> HashMap<String, Url> {
135    let document = scraper::Html::parse_document(html);
136    let selector = match scraper::Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
137        Ok(s) => s,
138        Err(_) => return HashMap::new(),
139    };
140
141    document
142        .select(&selector)
143        .filter_map(|el| {
144            let lang = el.value().attr("hreflang")?;
145            let href = el.value().attr("href")?;
146            let url = base_url.join(href).ok()?;
147            Some((lang.to_string(), url))
148        })
149        .collect()
150}