halldyll_core/parse/links.rs

//! Link extraction: outgoing links, canonical URL, pagination, and hreflang alternates.
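//!
//! A minimal usage sketch (added for illustration, not from the original
//! source). It assumes this module is publicly reachable as
//! `halldyll_core::parse::links`; the block is marked `ignore` because that
//! path is an assumption and should not be compiled as a doctest.
//!
//! ```ignore
//! use halldyll_core::parse::links::LinkExtractor;
//! use url::Url;
//!
//! let html = r#"<nav><a href="/about">About</a></nav>
//!               <p><a href="https://other.example.org/">Elsewhere</a></p>"#;
//! let base = Url::parse("https://example.com/").unwrap();
//!
//! for link in LinkExtractor::new().extract(html, &base) {
//!     println!("{} internal={}", link.url, link.is_internal);
//! }
//! ```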

use ego_tree::NodeId;
use scraper::{Html, Selector};
use std::collections::HashSet;
use url::Url;

use crate::crawl::normalize::UrlNormalizer;
use crate::types::document::{LinkType, OutLink};

/// Link extractor
pub struct LinkExtractor {
    /// URL normalizer
    normalizer: UrlNormalizer,
    /// Extract navigation links?
    extract_navigation: bool,
    /// Extract footer links?
    extract_footer: bool,
}

impl Default for LinkExtractor {
    fn default() -> Self {
        Self {
            normalizer: UrlNormalizer::default(),
            extract_navigation: true,
            extract_footer: true,
        }
    }
}

impl LinkExtractor {
    /// Create an extractor with the default settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Configure whether navigation and footer links are extracted
    pub fn with_options(mut self, extract_navigation: bool, extract_footer: bool) -> Self {
        self.extract_navigation = extract_navigation;
        self.extract_footer = extract_footer;
        self
    }

    /// Extract all links
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<OutLink> {
        let document = Html::parse_document(html);
        let mut links = Vec::new();

        // Selector for all <a> links
        let a_selector = Selector::parse("a[href]").unwrap();

        // Selectors to identify context
        let nav_selector = Selector::parse("nav, header, .navigation, .menu, [role=\"navigation\"]").unwrap();
        let footer_selector = Selector::parse("footer, .footer, [role=\"contentinfo\"]").unwrap();
        let aside_selector = Selector::parse("aside, .sidebar, [role=\"complementary\"]").unwrap();

        // Node IDs of each context zone, used to classify links by ancestry
        let nav_ids: HashSet<_> = document.select(&nav_selector).map(|e| e.id()).collect();
        let footer_ids: HashSet<_> = document.select(&footer_selector).map(|e| e.id()).collect();
        let aside_ids: HashSet<_> = document.select(&aside_selector).map(|e| e.id()).collect();

        for element in document.select(&a_selector) {
            let href = match element.value().attr("href") {
                Some(h) => h,
                None => continue,
            };

            // Ignore empty, javascript:, mailto: and tel: links
            if href.is_empty() || href.starts_with("javascript:") || href.starts_with("mailto:") || href.starts_with("tel:") {
                continue;
            }

            // Resolve against the base URL and normalize
            let url = match base_url.join(href) {
                Ok(u) => self.normalizer.normalize(&u),
                Err(_) => continue,
            };

            // Ignore non-HTTP(S) URLs
            if !matches!(url.scheme(), "http" | "https") {
                continue;
            }

            // Determine link type
            let link_type = self.determine_link_type(&element, &nav_ids, &footer_ids, &aside_ids);

            // Filter according to configuration
            if !self.extract_navigation && link_type == LinkType::Navigation {
                continue;
            }
            // Footer links are classified as Navigation by determine_link_type,
            // so when footer extraction is disabled, drop links whose ancestors
            // lie inside a footer zone specifically.
            if !self.extract_footer
                && element.ancestors().any(|a| footer_ids.contains(&a.id()))
            {
                continue;
            }

            // Anchor text
            let anchor_text = element.text().collect::<Vec<_>>().join(" ").trim().to_string();
            let anchor_text = if anchor_text.is_empty() { None } else { Some(anchor_text) };

            // Rel attribute
            let rel = element.value().attr("rel").map(String::from);

            // Internal or external?
            let is_internal = url.host() == base_url.host();

            links.push(OutLink {
                url,
                anchor_text,
                rel,
                is_internal,
                link_type,
            });
        }

        // Deduplicate by URL
        links.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
        links.dedup_by(|a, b| a.url == b.url);

        links
    }

    /// Determine link type based on its context
    fn determine_link_type(
        &self,
        element: &scraper::ElementRef,
        nav_ids: &HashSet<NodeId>,
        footer_ids: &HashSet<NodeId>,
        aside_ids: &HashSet<NodeId>,
    ) -> LinkType {
        // Walk up the ancestors looking for a known context zone
        let mut parent = element.parent();
        while let Some(p) = parent {
            if nav_ids.contains(&p.id()) {
                return LinkType::Navigation;
            }
            if footer_ids.contains(&p.id()) {
                return LinkType::Navigation; // Footer also counts as navigation
            }
            if aside_ids.contains(&p.id()) {
                return LinkType::Other;
            }
            parent = p.parent();
        }

        LinkType::Content
    }

    /// Extract the canonical link
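    ///
    /// Illustrative sketch (not in the original source; marked `ignore`
    /// because the crate path `halldyll_core::parse::links` is assumed):
    ///
    /// ```ignore
    /// use halldyll_core::parse::links::LinkExtractor;
    /// use url::Url;
    ///
    /// let html = r#"<head><link rel="canonical" href="https://example.com/post"></head>"#;
    /// let base = Url::parse("https://example.com/post?utm_source=feed").unwrap();
    /// let canonical = LinkExtractor::new().extract_canonical(html, &base);
    /// assert!(canonical.is_some());
    /// ```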
    pub fn extract_canonical(&self, html: &str, base_url: &Url) -> Option<Url> {
        let document = Html::parse_document(html);
        let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;

        document
            .select(&selector)
            .next()
            .and_then(|el| el.value().attr("href"))
            .and_then(|href| base_url.join(href).ok())
            .map(|u| self.normalizer.normalize(&u))
    }

    /// Extract pagination links
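    ///
    /// Illustrative sketch (not in the original source; `ignore`d because
    /// the crate path is assumed). Note that, unlike `extract`, this method
    /// does not run the URL normalizer:
    ///
    /// ```ignore
    /// use halldyll_core::parse::links::LinkExtractor;
    /// use url::Url;
    ///
    /// let html = r#"<head><link rel="next" href="/page/3"><link rel="prev" href="/page/1"></head>"#;
    /// let base = Url::parse("https://example.com/page/2").unwrap();
    /// let pages = LinkExtractor::new().extract_pagination(html, &base);
    /// assert_eq!(pages.next.unwrap().path(), "/page/3");
    /// assert_eq!(pages.prev.unwrap().path(), "/page/1");
    /// ```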
    pub fn extract_pagination(&self, html: &str, base_url: &Url) -> PaginationLinks {
        let document = Html::parse_document(html);

        let next = Selector::parse(r#"link[rel="next"], a[rel="next"]"#)
            .ok()
            .and_then(|sel| {
                document.select(&sel).next()
                    .and_then(|el| el.value().attr("href"))
                    .and_then(|href| base_url.join(href).ok())
            });

        let prev = Selector::parse(r#"link[rel="prev"], a[rel="prev"]"#)
            .ok()
            .and_then(|sel| {
                document.select(&sel).next()
                    .and_then(|el| el.value().attr("href"))
                    .and_then(|href| base_url.join(href).ok())
            });

        PaginationLinks { next, prev }
    }

    /// Extract hreflang links
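    ///
    /// Illustrative sketch (not in the original source; `ignore`d because
    /// the crate path is assumed):
    ///
    /// ```ignore
    /// use halldyll_core::parse::links::LinkExtractor;
    /// use url::Url;
    ///
    /// let html = r#"<link rel="alternate" hreflang="fr-FR" href="https://example.com/fr/">"#;
    /// let base = Url::parse("https://example.com/").unwrap();
    /// let alternates = LinkExtractor::new().extract_hreflang(html, &base);
    /// assert_eq!(alternates[0].lang, "fr-FR");
    /// ```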
    pub fn extract_hreflang(&self, html: &str, base_url: &Url) -> Vec<HreflangLink> {
        let document = Html::parse_document(html);
        let selector = match Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
            Ok(s) => s,
            Err(_) => return Vec::new(),
        };

        document
            .select(&selector)
            .filter_map(|el| {
                let lang = el.value().attr("hreflang")?;
                let href = el.value().attr("href")?;
                let url = base_url.join(href).ok()?;
                Some(HreflangLink {
                    lang: lang.to_string(),
                    url,
                })
            })
            .collect()
    }
}

/// Pagination links
#[derive(Debug, Clone)]
pub struct PaginationLinks {
    /// Next page URL
    pub next: Option<Url>,
    /// Previous page URL
    pub prev: Option<Url>,
}

/// Hreflang link for internationalization
#[derive(Debug, Clone)]
pub struct HreflangLink {
    /// Language code (e.g., "en", "fr-FR")
    pub lang: String,
    /// URL for this language version
    pub url: Url,
}
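
// A minimal test sketch added for illustration; it is not from the original
// file. It assumes `UrlNormalizer::default()` leaves simple absolute http(s)
// URLs unchanged and that `LinkType` has the variants used above
// (Navigation / Content / Other). Adjust the fixtures if those assumptions
// do not hold.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn classifies_nav_footer_and_content_links() {
        let html = r#"
            <html><body>
                <nav><a href="/about">About</a></nav>
                <p>
                    <a href="/post/1">Internal post</a>
                    <a href="https://other.example.org/page">External</a>
                    <a href="mailto:someone@example.com">Mail</a>
                </p>
                <footer><a href="/legal">Legal</a></footer>
            </body></html>
        "#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = LinkExtractor::new().extract(html, &base);

        // mailto: links are skipped entirely.
        assert!(links.iter().all(|l| l.url.scheme().starts_with("http")));

        // The in-content link is classified as Content and is internal.
        let post = links
            .iter()
            .find(|l| l.anchor_text.as_deref() == Some("Internal post"))
            .expect("internal post link");
        assert!(matches!(post.link_type, LinkType::Content));
        assert!(post.is_internal);

        // Links inside <nav> (and <footer>) are classified as Navigation.
        let about = links
            .iter()
            .find(|l| l.anchor_text.as_deref() == Some("About"))
            .expect("nav link");
        assert!(matches!(about.link_type, LinkType::Navigation));

        // The cross-host link is marked as external.
        let external = links
            .iter()
            .find(|l| l.anchor_text.as_deref() == Some("External"))
            .expect("external link");
        assert!(!external.is_internal);
    }

    #[test]
    fn disabled_navigation_drops_nav_links() {
        let html = r#"<nav><a href="/menu">Menu</a></nav><p><a href="/body">Body</a></p>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = LinkExtractor::new()
            .with_options(false, true)
            .extract(html, &base);

        assert!(links.iter().all(|l| !matches!(l.link_type, LinkType::Navigation)));
        assert!(links.iter().any(|l| l.anchor_text.as_deref() == Some("Body")));
    }
}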