1use scraper::{Html, ElementRef};
11use std::collections::HashSet;
12use url::Url;
13
14use crate::selector::SELECTORS;
15use crate::types::{Link, LinkRel, LinkType, ParserConfig, ParserResult};
16
17pub fn extract_links(
23 document: &Html,
24 config: &ParserConfig,
25) -> ParserResult<Vec<Link>> {
26 let mut links = Vec::new();
27 let mut seen_hrefs: HashSet<String> = HashSet::new();
28
29 for anchor in document.select(&SELECTORS.a) {
30 if let Some(link) = extract_link(&anchor, config.base_url.as_ref()) {
31 if !seen_hrefs.contains(&link.href) {
33 seen_hrefs.insert(link.href.clone());
34 links.push(link);
35 }
36 }
37 }
38
39 Ok(links)
40}
41
42pub fn extract_link(element: &ElementRef, base_url: Option<&Url>) -> Option<Link> {
44 let href = element.value().attr("href")?;
45 let href = href.trim();
46
47 if href.is_empty()
49 || href.starts_with("javascript:")
50 || href.starts_with("mailto:")
51 || href.starts_with("tel:")
52 || href.starts_with("data:")
53 || href == "#"
54 {
55 return None;
56 }
57
58 let text = element.text().collect::<String>().trim().to_string();
60
61 let mut link = Link::new(href, &text);
63
64 link.url = resolve_url(href, base_url);
66
67 if let Some(rel) = element.value().attr("rel") {
69 link.rel = parse_rel_attribute(rel);
70 link.is_nofollow = link.rel.contains(&LinkRel::NoFollow);
71 }
72
73 link.title = element.value().attr("title").map(|s| s.to_string());
75 link.target = element.value().attr("target").map(|s| s.to_string());
76 link.hreflang = element.value().attr("hreflang").map(|s| s.to_string());
77
78 link.link_type = determine_link_type(&link.url, base_url);
80
81 Some(link)
82}
83
84pub fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
90 let trimmed = href.trim();
91
92 if trimmed.is_empty() {
93 return None;
94 }
95
96 if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
98 return normalize_url(trimmed);
99 }
100
101 if trimmed.starts_with("//") {
103 return normalize_url(&format!("https:{}", trimmed));
104 }
105
106 base_url
108 .and_then(|base| base.join(trimmed).ok())
109 .and_then(|u| normalize_url(u.as_str()))
110}
111
112pub fn normalize_url(url: &str) -> Option<String> {
114 Url::parse(url).ok().map(|mut u| {
115 u.set_fragment(None);
117
118 let path = u.path().to_string();
120 if path.len() > 1 && path.ends_with('/') {
121 u.set_path(path.trim_end_matches('/'));
122 }
123
124 u.to_string()
125 })
126}
127
128fn determine_link_type(resolved_url: &Option<String>, base_url: Option<&Url>) -> LinkType {
130 let (Some(url_str), Some(base)) = (resolved_url, base_url) else {
131 return LinkType::Unknown;
132 };
133
134 let Ok(url) = Url::parse(url_str) else {
135 return LinkType::Unknown;
136 };
137
138 match (url.host_str(), base.host_str()) {
140 (Some(url_host), Some(base_host)) => {
141 if url_host == base_host {
143 LinkType::Internal
144 } else if url_host.ends_with(&format!(".{}", base_host))
145 || base_host.ends_with(&format!(".{}", url_host)) {
146 LinkType::Internal
148 } else {
149 LinkType::External
150 }
151 }
152 _ => LinkType::Unknown,
153 }
154}
155
156pub fn parse_rel_attribute(rel: &str) -> Vec<LinkRel> {
162 rel.split_whitespace()
163 .map(|r| match r.to_lowercase().as_str() {
164 "nofollow" => LinkRel::NoFollow,
165 "ugc" => LinkRel::Ugc,
166 "sponsored" => LinkRel::Sponsored,
167 "external" => LinkRel::External,
168 "noopener" => LinkRel::NoOpener,
169 "noreferrer" => LinkRel::NoReferrer,
170 _ => LinkRel::Other,
171 })
172 .collect()
173}
174
/// Returns `true` when the whitespace-separated `rel` value contains the
/// `nofollow` token.
///
/// Tokens are compared ASCII case-insensitively, without allocating a
/// lowercased copy of the attribute.
pub fn is_nofollow(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("nofollow"))
}
181
/// Returns `true` when the whitespace-separated `rel` value contains the
/// `sponsored` token.
///
/// Tokens are compared ASCII case-insensitively, without allocating a
/// lowercased copy of the attribute.
pub fn is_sponsored(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("sponsored"))
}
188
/// Returns `true` when the whitespace-separated `rel` value contains the
/// `ugc` token.
///
/// Tokens are compared ASCII case-insensitively, without allocating a
/// lowercased copy of the attribute.
pub fn is_ugc(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("ugc"))
}
195
196pub fn filter_internal_links(links: &[Link]) -> Vec<&Link> {
202 links.iter()
203 .filter(|l| l.link_type == LinkType::Internal)
204 .collect()
205}
206
207pub fn filter_external_links(links: &[Link]) -> Vec<&Link> {
209 links.iter()
210 .filter(|l| l.link_type == LinkType::External)
211 .collect()
212}
213
214pub fn filter_followable_links(links: &[Link]) -> Vec<&Link> {
216 links.iter()
217 .filter(|l| l.should_follow())
218 .collect()
219}
220
221pub fn get_external_domains(links: &[Link]) -> HashSet<String> {
223 links.iter()
224 .filter(|l| l.link_type == LinkType::External)
225 .filter_map(|l| l.url.as_ref())
226 .filter_map(|url| Url::parse(url).ok())
227 .filter_map(|url| url.host_str().map(|h| h.to_string()))
228 .collect()
229}
230
/// Aggregate counts over a set of links, as filled in by
/// `calculate_link_stats`.
///
/// The categories are independent of each other, so a single link may be
/// counted in several fields.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct LinkStats {
    /// Number of links examined.
    pub total: usize,
    /// Links classified as `LinkType::Internal`.
    pub internal: usize,
    /// Links classified as `LinkType::External`.
    pub external: usize,
    /// Links whose `is_nofollow` flag is set.
    pub nofollow: usize,
    /// Links whose `rel` list contains `LinkRel::Sponsored`.
    pub sponsored: usize,
    /// Links whose `rel` list contains `LinkRel::Ugc`.
    pub ugc: usize,
    /// Links that have a `title` value.
    pub with_title: usize,
    /// Links for which `Link::opens_new_tab` reports `true`.
    pub opens_new_tab: usize,
}
242
243pub fn calculate_link_stats(links: &[Link]) -> LinkStats {
245 LinkStats {
246 total: links.len(),
247 internal: links.iter().filter(|l| l.link_type == LinkType::Internal).count(),
248 external: links.iter().filter(|l| l.link_type == LinkType::External).count(),
249 nofollow: links.iter().filter(|l| l.is_nofollow).count(),
250 sponsored: links.iter().filter(|l| l.rel.contains(&LinkRel::Sponsored)).count(),
251 ugc: links.iter().filter(|l| l.rel.contains(&LinkRel::Ugc)).count(),
252 with_title: links.iter().filter(|l| l.title.is_some()).count(),
253 opens_new_tab: links.iter().filter(|l| l.opens_new_tab()).count(),
254 }
255}
256
/// Unit tests for link extraction, URL resolution, link classification and
/// the `rel` helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an HTML snippet into a full document for use with selectors.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_links_basic() {
        let doc = parse_html(r#"
            <html><body>
                <a href="https://example.com">Example</a>
                <a href="/page">Internal</a>
            </body></html>
        "#);
        let config = ParserConfig::default();
        let links = extract_links(&doc, &config).unwrap();
        assert_eq!(links.len(), 2);
    }

    #[test]
    fn test_extract_link_with_attributes() {
        let doc = parse_html(r#"
            <a href="https://example.com"
               title="Example Site"
               rel="nofollow external"
               target="_blank">Link</a>
        "#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        let link = extract_link(&anchor, None).unwrap();

        assert_eq!(link.href, "https://example.com");
        assert_eq!(link.text, "Link");
        assert_eq!(link.title, Some("Example Site".to_string()));
        assert!(link.is_nofollow);
        assert!(link.opens_new_tab());
        assert!(link.rel.contains(&LinkRel::NoFollow));
        assert!(link.rel.contains(&LinkRel::External));
    }

    #[test]
    fn test_extract_link_skips_javascript() {
        let doc = parse_html(r#"<a href="javascript:void(0)">Click</a>"#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    #[test]
    fn test_extract_link_skips_mailto() {
        let doc = parse_html(r#"<a href="mailto:test@example.com">Email</a>"#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    #[test]
    fn test_extract_link_skips_tel_and_data() {
        let doc = parse_html(r#"
            <a href="tel:+15550100">Call</a>
            <a href="data:text/plain,hello">Inline</a>
        "#);
        for anchor in doc.select(&SELECTORS.a) {
            assert!(extract_link(&anchor, None).is_none());
        }
    }

    #[test]
    fn test_extract_link_skips_hash() {
        let doc = parse_html("<a href=\"#\">Top</a>");
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    #[test]
    fn test_extract_link_skips_missing_or_blank_href() {
        let doc = parse_html(r#"
            <a name="anchor-only">No href</a>
            <a href="   ">Blank href</a>
        "#);
        for anchor in doc.select(&SELECTORS.a) {
            assert!(extract_link(&anchor, None).is_none());
        }
    }

    #[test]
    fn test_resolve_url_absolute() {
        assert_eq!(
            resolve_url("https://example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_protocol_relative() {
        assert_eq!(
            resolve_url("//example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_relative() {
        let base = Url::parse("https://example.com/dir/").unwrap();
        assert_eq!(
            resolve_url("page.html", Some(&base)),
            Some("https://example.com/dir/page.html".to_string())
        );
    }

    #[test]
    fn test_resolve_url_root_relative() {
        let base = Url::parse("https://example.com/dir/page").unwrap();
        assert_eq!(
            resolve_url("/other", Some(&base)),
            Some("https://example.com/other".to_string())
        );
    }

    #[test]
    fn test_resolve_url_without_base() {
        // A relative reference cannot be resolved without a base URL.
        assert_eq!(resolve_url("page.html", None), None);
        // Blank input never resolves.
        assert_eq!(resolve_url("   ", None), None);
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_normalize_url_strips_trailing_slash() {
        assert_eq!(
            normalize_url("https://example.com/dir/"),
            Some("https://example.com/dir".to_string())
        );
        // The root path is a single `/` and is left alone.
        assert_eq!(
            normalize_url("https://example.com"),
            Some("https://example.com/".to_string())
        );
    }

    #[test]
    fn test_normalize_url_rejects_invalid() {
        assert_eq!(normalize_url("not a url"), None);
    }

    #[test]
    fn test_determine_link_type_internal() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://example.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::Internal);
    }

    #[test]
    fn test_determine_link_type_subdomain() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://blog.example.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::Internal);
    }

    #[test]
    fn test_determine_link_type_parent_domain() {
        let base = Url::parse("https://blog.example.com").unwrap();
        let url = Some("https://example.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::Internal);
    }

    #[test]
    fn test_determine_link_type_external() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://other.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::External);
    }

    #[test]
    fn test_determine_link_type_unknown() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://example.com/page".to_string());
        assert_eq!(determine_link_type(&url, None), LinkType::Unknown);
        assert_eq!(determine_link_type(&None, Some(&base)), LinkType::Unknown);
    }

    #[test]
    fn test_parse_rel_attribute() {
        let rels = parse_rel_attribute("nofollow ugc sponsored");
        assert!(rels.contains(&LinkRel::NoFollow));
        assert!(rels.contains(&LinkRel::Ugc));
        assert!(rels.contains(&LinkRel::Sponsored));
    }

    #[test]
    fn test_parse_rel_attribute_case_and_unknown_tokens() {
        let rels = parse_rel_attribute("NoFollow author");
        assert_eq!(rels.len(), 2);
        assert!(rels.contains(&LinkRel::NoFollow));
        assert!(rels.contains(&LinkRel::Other));
        assert!(parse_rel_attribute("").is_empty());
    }

    #[test]
    fn test_is_nofollow() {
        assert!(is_nofollow("nofollow"));
        assert!(is_nofollow("nofollow external"));
        assert!(is_nofollow("external nofollow"));
        assert!(!is_nofollow("external"));
    }

    #[test]
    fn test_is_sponsored() {
        assert!(is_sponsored("sponsored"));
        assert!(is_sponsored("nofollow Sponsored"));
        assert!(!is_sponsored("nofollow"));
        assert!(!is_sponsored(""));
    }

    #[test]
    fn test_is_ugc() {
        assert!(is_ugc("ugc"));
        assert!(is_ugc("nofollow UGC"));
        assert!(!is_ugc("nofollow"));
        assert!(!is_ugc(""));
    }

    #[test]
    fn test_filter_internal_links() {
        let links = vec![
            Link { link_type: LinkType::Internal, ..Link::new("/a", "A") },
            Link { link_type: LinkType::External, ..Link::new("https://ext.com", "B") },
            Link { link_type: LinkType::Internal, ..Link::new("/b", "C") },
        ];
        let internal = filter_internal_links(&links);
        assert_eq!(internal.len(), 2);
    }

    #[test]
    fn test_filter_external_links() {
        let links = vec![
            Link { link_type: LinkType::Internal, ..Link::new("/a", "A") },
            Link { link_type: LinkType::External, ..Link::new("https://ext.com", "B") },
            Link { link_type: LinkType::Internal, ..Link::new("/b", "C") },
        ];
        let external = filter_external_links(&links);
        assert_eq!(external.len(), 1);
        assert_eq!(external[0].href, "https://ext.com");
    }

    #[test]
    fn test_filter_followable_links() {
        let mut nofollow = Link::new("/page", "Page");
        nofollow.is_nofollow = true;

        let links = vec![
            Link::new("/a", "A"),
            nofollow,
            Link::new("/b", "B"),
        ];
        let followable = filter_followable_links(&links);
        assert_eq!(followable.len(), 2);
    }

    #[test]
    fn test_get_external_domains() {
        let links = vec![
            Link {
                link_type: LinkType::External,
                url: Some("https://example.com/page".to_string()),
                ..Link::new("https://example.com/page", "A")
            },
            Link {
                link_type: LinkType::External,
                url: Some("https://other.com/page".to_string()),
                ..Link::new("https://other.com/page", "B")
            },
            Link {
                link_type: LinkType::External,
                url: Some("https://example.com/other".to_string()),
                ..Link::new("https://example.com/other", "C")
            },
        ];
        let domains = get_external_domains(&links);
        assert_eq!(domains.len(), 2);
        assert!(domains.contains("example.com"));
        assert!(domains.contains("other.com"));
    }

    #[test]
    fn test_calculate_link_stats() {
        let mut nofollow = Link::new("/page", "Page");
        nofollow.is_nofollow = true;
        nofollow.link_type = LinkType::Internal;

        let mut sponsored = Link::new("https://ad.com", "Ad");
        sponsored.rel = vec![LinkRel::Sponsored];
        sponsored.link_type = LinkType::External;

        let links = vec![
            Link { link_type: LinkType::Internal, ..Link::new("/a", "A") },
            nofollow,
            sponsored,
        ];

        let stats = calculate_link_stats(&links);
        assert_eq!(stats.total, 3);
        assert_eq!(stats.internal, 2);
        assert_eq!(stats.external, 1);
        assert_eq!(stats.nofollow, 1);
        assert_eq!(stats.sponsored, 1);
    }

    #[test]
    fn test_deduplicate_links() {
        let doc = parse_html(r#"
            <html><body>
                <a href="https://example.com">First</a>
                <a href="https://example.com">Duplicate</a>
                <a href="https://other.com">Other</a>
            </body></html>
        "#);
        let config = ParserConfig::default();
        let links = extract_links(&doc, &config).unwrap();
        assert_eq!(links.len(), 2);
    }
}