// halldyll_core/parse/links.rs
use ego_tree::NodeId;
4use scraper::{Html, Selector};
5use std::collections::HashSet;
6use url::Url;
7
8use crate::types::document::{LinkType, OutLink};
9use crate::crawl::normalize::UrlNormalizer;
10
/// Extracts and classifies outgoing links — plus canonical, pagination and
/// hreflang metadata — from an HTML document.
pub struct LinkExtractor {
    /// Normalizer applied to every resolved URL before it is returned.
    normalizer: UrlNormalizer,
    /// When false, links classified as `LinkType::Navigation` are dropped by `extract`.
    extract_navigation: bool,
    /// Intended to drop footer links when false.
    /// NOTE(review): the footer filter in `extract` currently has an empty
    /// body, so this flag has no effect — confirm intended behavior.
    extract_footer: bool,
}
20
21impl Default for LinkExtractor {
22 fn default() -> Self {
23 Self {
24 normalizer: UrlNormalizer::default(),
25 extract_navigation: true,
26 extract_footer: true,
27 }
28 }
29}
30
31impl LinkExtractor {
32 pub fn new() -> Self {
34 Self::default()
35 }
36
37 pub fn with_options(mut self, extract_navigation: bool, extract_footer: bool) -> Self {
39 self.extract_navigation = extract_navigation;
40 self.extract_footer = extract_footer;
41 self
42 }
43
44 pub fn extract(&self, html: &str, base_url: &Url) -> Vec<OutLink> {
46 let document = Html::parse_document(html);
47 let mut links = Vec::new();
48
49 let a_selector = Selector::parse("a[href]").unwrap();
51
52 let nav_selector = Selector::parse("nav, header, .navigation, .menu, [role=\"navigation\"]").unwrap();
54 let footer_selector = Selector::parse("footer, .footer, [role=\"contentinfo\"]").unwrap();
55 let aside_selector = Selector::parse("aside, .sidebar, [role=\"complementary\"]").unwrap();
56
57 let nav_ids: std::collections::HashSet<_> = document.select(&nav_selector).map(|e| e.id()).collect();
59 let footer_ids: std::collections::HashSet<_> = document.select(&footer_selector).map(|e| e.id()).collect();
60 let aside_ids: std::collections::HashSet<_> = document.select(&aside_selector).map(|e| e.id()).collect();
61
62 for element in document.select(&a_selector) {
63 let href = match element.value().attr("href") {
64 Some(h) => h,
65 None => continue,
66 };
67
68 if href.is_empty() || href.starts_with("javascript:") || href.starts_with("mailto:") || href.starts_with("tel:") {
70 continue;
71 }
72
73 let url = match base_url.join(href) {
75 Ok(u) => self.normalizer.normalize(&u),
76 Err(_) => continue,
77 };
78
79 if !url.scheme().starts_with("http") {
81 continue;
82 }
83
84 let link_type = self.determine_link_type(&element, &nav_ids, &footer_ids, &aside_ids);
86
87 if !self.extract_navigation && link_type == LinkType::Navigation {
89 continue;
90 }
91 if !self.extract_footer && matches!(link_type, LinkType::Navigation) {
92 }
94
95 let anchor_text = element.text().collect::<Vec<_>>().join(" ").trim().to_string();
97 let anchor_text = if anchor_text.is_empty() { None } else { Some(anchor_text) };
98
99 let rel = element.value().attr("rel").map(String::from);
101
102 let is_internal = url.host() == base_url.host();
104
105 links.push(OutLink {
106 url,
107 anchor_text,
108 rel,
109 is_internal,
110 link_type,
111 });
112 }
113
114 links.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
116 links.dedup_by(|a, b| a.url == b.url);
117
118 links
119 }
120
121 fn determine_link_type(
123 &self,
124 element: &scraper::ElementRef,
125 nav_ids: &HashSet<NodeId>,
126 footer_ids: &HashSet<NodeId>,
127 aside_ids: &HashSet<NodeId>,
128 ) -> LinkType {
129 let mut parent = element.parent();
131 while let Some(p) = parent {
132 if nav_ids.contains(&p.id()) {
133 return LinkType::Navigation;
134 }
135 if footer_ids.contains(&p.id()) {
136 return LinkType::Navigation; }
138 if aside_ids.contains(&p.id()) {
139 return LinkType::Other;
140 }
141 parent = p.parent();
142 }
143
144 LinkType::Content
145 }
146
147 pub fn extract_canonical(&self, html: &str, base_url: &Url) -> Option<Url> {
149 let document = Html::parse_document(html);
150 let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;
151
152 document
153 .select(&selector)
154 .next()
155 .and_then(|el| el.value().attr("href"))
156 .and_then(|href| base_url.join(href).ok())
157 .map(|u| self.normalizer.normalize(&u))
158 }
159
160 pub fn extract_pagination(&self, html: &str, base_url: &Url) -> PaginationLinks {
162 let document = Html::parse_document(html);
163
164 let next = Selector::parse(r#"link[rel="next"], a[rel="next"]"#)
165 .ok()
166 .and_then(|sel| {
167 document.select(&sel).next()
168 .and_then(|el| el.value().attr("href"))
169 .and_then(|href| base_url.join(href).ok())
170 });
171
172 let prev = Selector::parse(r#"link[rel="prev"], a[rel="prev"]"#)
173 .ok()
174 .and_then(|sel| {
175 document.select(&sel).next()
176 .and_then(|el| el.value().attr("href"))
177 .and_then(|href| base_url.join(href).ok())
178 });
179
180 PaginationLinks { next, prev }
181 }
182
183 pub fn extract_hreflang(&self, html: &str, base_url: &Url) -> Vec<HreflangLink> {
185 let document = Html::parse_document(html);
186 let selector = match Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
187 Ok(s) => s,
188 Err(_) => return Vec::new(),
189 };
190
191 document
192 .select(&selector)
193 .filter_map(|el| {
194 let lang = el.value().attr("hreflang")?;
195 let href = el.value().attr("href")?;
196 let url = base_url.join(href).ok()?;
197 Some(HreflangLink {
198 lang: lang.to_string(),
199 url,
200 })
201 })
202 .collect()
203 }
204}
205
/// `rel="next"` / `rel="prev"` pagination links discovered in a page.
#[derive(Debug, Clone)]
pub struct PaginationLinks {
    /// URL of the next page, if declared.
    pub next: Option<Url>,
    /// URL of the previous page, if declared.
    pub prev: Option<Url>,
}
214
/// A single `hreflang` language-alternate link.
#[derive(Debug, Clone)]
pub struct HreflangLink {
    /// The `hreflang` attribute value as found in the document (not validated).
    pub lang: String,
    /// The resolved alternate URL.
    pub url: Url,
}