halldyll_parser/
links.rs

1//! Link extraction for halldyll-parser
2//!
3//! This module handles:
4//! - Link extraction from anchor tags
5//! - URL normalization and resolution
6//! - Rel attribute parsing (nofollow, ugc, sponsored, etc.)
7//! - Internal/external link classification
8//! - Link deduplication
9
10use scraper::{Html, ElementRef};
11use std::collections::HashSet;
12use url::Url;
13
14use crate::selector::SELECTORS;
15use crate::types::{Link, LinkRel, LinkType, ParserConfig, ParserResult};
16
17// ============================================================================
18// MAIN EXTRACTION
19// ============================================================================
20
21/// Extract all links from an HTML document
22pub fn extract_links(
23    document: &Html,
24    config: &ParserConfig,
25) -> ParserResult<Vec<Link>> {
26    let mut links = Vec::new();
27    let mut seen_hrefs: HashSet<String> = HashSet::new();
28    
29    for anchor in document.select(&SELECTORS.a) {
30        if let Some(link) = extract_link(&anchor, config.base_url.as_ref()) {
31            // Deduplicate by href
32            if !seen_hrefs.contains(&link.href) {
33                seen_hrefs.insert(link.href.clone());
34                links.push(link);
35            }
36        }
37    }
38    
39    Ok(links)
40}
41
42/// Extract a single link from an anchor element
43pub fn extract_link(element: &ElementRef, base_url: Option<&Url>) -> Option<Link> {
44    let href = element.value().attr("href")?;
45    let href = href.trim();
46    
47    // Skip empty, javascript, and mailto links
48    if href.is_empty() 
49        || href.starts_with("javascript:") 
50        || href.starts_with("mailto:")
51        || href.starts_with("tel:")
52        || href.starts_with("data:")
53        || href == "#"
54    {
55        return None;
56    }
57    
58    // Get anchor text
59    let text = element.text().collect::<String>().trim().to_string();
60    
61    // Create link
62    let mut link = Link::new(href, &text);
63    
64    // Resolve URL
65    link.url = resolve_url(href, base_url);
66    
67    // Parse rel attributes
68    if let Some(rel) = element.value().attr("rel") {
69        link.rel = parse_rel_attribute(rel);
70        link.is_nofollow = link.rel.contains(&LinkRel::NoFollow);
71    }
72    
73    // Get other attributes
74    link.title = element.value().attr("title").map(|s| s.to_string());
75    link.target = element.value().attr("target").map(|s| s.to_string());
76    link.hreflang = element.value().attr("hreflang").map(|s| s.to_string());
77    
78    // Determine link type (internal/external)
79    link.link_type = determine_link_type(&link.url, base_url);
80    
81    Some(link)
82}
83
84// ============================================================================
85// URL HANDLING
86// ============================================================================
87
88/// Resolve a relative URL to absolute
89pub fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
90    let trimmed = href.trim();
91    
92    if trimmed.is_empty() {
93        return None;
94    }
95    
96    // Already absolute
97    if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
98        return normalize_url(trimmed);
99    }
100    
101    // Protocol-relative
102    if trimmed.starts_with("//") {
103        return normalize_url(&format!("https:{}", trimmed));
104    }
105    
106    // Resolve relative to base
107    base_url
108        .and_then(|base| base.join(trimmed).ok())
109        .and_then(|u| normalize_url(u.as_str()))
110}
111
112/// Normalize a URL (remove fragments, trailing slashes for paths)
113pub fn normalize_url(url: &str) -> Option<String> {
114    Url::parse(url).ok().map(|mut u| {
115        // Remove fragment
116        u.set_fragment(None);
117        
118        // Normalize path (remove trailing slash except for root)
119        let path = u.path().to_string();
120        if path.len() > 1 && path.ends_with('/') {
121            u.set_path(path.trim_end_matches('/'));
122        }
123        
124        u.to_string()
125    })
126}
127
128/// Determine if a link is internal or external
129fn determine_link_type(resolved_url: &Option<String>, base_url: Option<&Url>) -> LinkType {
130    let (Some(url_str), Some(base)) = (resolved_url, base_url) else {
131        return LinkType::Unknown;
132    };
133    
134    let Ok(url) = Url::parse(url_str) else {
135        return LinkType::Unknown;
136    };
137    
138    // Compare hosts
139    match (url.host_str(), base.host_str()) {
140        (Some(url_host), Some(base_host)) => {
141            // Check if same domain or subdomain
142            if url_host == base_host {
143                LinkType::Internal
144            } else if url_host.ends_with(&format!(".{}", base_host)) 
145                   || base_host.ends_with(&format!(".{}", url_host)) {
146                // Subdomain relationship
147                LinkType::Internal
148            } else {
149                LinkType::External
150            }
151        }
152        _ => LinkType::Unknown,
153    }
154}
155
156// ============================================================================
157// REL ATTRIBUTE PARSING
158// ============================================================================
159
160/// Parse rel attribute into LinkRel values
161pub fn parse_rel_attribute(rel: &str) -> Vec<LinkRel> {
162    rel.split_whitespace()
163        .map(|r| match r.to_lowercase().as_str() {
164            "nofollow" => LinkRel::NoFollow,
165            "ugc" => LinkRel::Ugc,
166            "sponsored" => LinkRel::Sponsored,
167            "external" => LinkRel::External,
168            "noopener" => LinkRel::NoOpener,
169            "noreferrer" => LinkRel::NoReferrer,
170            _ => LinkRel::Other,
171        })
172        .collect()
173}
174
/// Report whether a `rel` attribute value contains the `nofollow` token.
///
/// Tokens are whitespace-separated and compared without regard to case.
pub fn is_nofollow(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("nofollow"))
}
181
/// Report whether a `rel` attribute value contains the `sponsored` token.
///
/// Tokens are whitespace-separated and compared without regard to case.
pub fn is_sponsored(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("sponsored"))
}
188
/// Report whether a `rel` attribute value contains the `ugc`
/// (user-generated content) token.
///
/// Tokens are whitespace-separated and compared without regard to case.
pub fn is_ugc(rel: &str) -> bool {
    rel.split_whitespace()
        .any(|token| token.eq_ignore_ascii_case("ugc"))
}
195
196// ============================================================================
197// LINK ANALYSIS
198// ============================================================================
199
200/// Get all internal links
201pub fn filter_internal_links(links: &[Link]) -> Vec<&Link> {
202    links.iter()
203        .filter(|l| l.link_type == LinkType::Internal)
204        .collect()
205}
206
207/// Get all external links
208pub fn filter_external_links(links: &[Link]) -> Vec<&Link> {
209    links.iter()
210        .filter(|l| l.link_type == LinkType::External)
211        .collect()
212}
213
214/// Get all followable links (not nofollow, not sponsored, not ugc)
215pub fn filter_followable_links(links: &[Link]) -> Vec<&Link> {
216    links.iter()
217        .filter(|l| l.should_follow())
218        .collect()
219}
220
221/// Get unique domains from external links
222pub fn get_external_domains(links: &[Link]) -> HashSet<String> {
223    links.iter()
224        .filter(|l| l.link_type == LinkType::External)
225        .filter_map(|l| l.url.as_ref())
226        .filter_map(|url| Url::parse(url).ok())
227        .filter_map(|url| url.host_str().map(|h| h.to_string()))
228        .collect()
229}
230
/// Aggregate counts describing a set of links, as produced by
/// `calculate_link_stats`.
///
/// The categories are independent tallies, not a partition: a single link
/// can be counted in several of them.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct LinkStats {
    /// Number of links examined.
    pub total: usize,
    /// Links classified as `LinkType::Internal`.
    pub internal: usize,
    /// Links classified as `LinkType::External`.
    pub external: usize,
    /// Links whose `is_nofollow` flag is set.
    pub nofollow: usize,
    /// Links whose `rel` list contains `LinkRel::Sponsored`.
    pub sponsored: usize,
    /// Links whose `rel` list contains `LinkRel::Ugc`.
    pub ugc: usize,
    /// Links that carry a `title`.
    pub with_title: usize,
    /// Links for which `Link::opens_new_tab` returns `true`.
    pub opens_new_tab: usize,
}
242
243/// Calculate link statistics
244pub fn calculate_link_stats(links: &[Link]) -> LinkStats {
245    LinkStats {
246        total: links.len(),
247        internal: links.iter().filter(|l| l.link_type == LinkType::Internal).count(),
248        external: links.iter().filter(|l| l.link_type == LinkType::External).count(),
249        nofollow: links.iter().filter(|l| l.is_nofollow).count(),
250        sponsored: links.iter().filter(|l| l.rel.contains(&LinkRel::Sponsored)).count(),
251        ugc: links.iter().filter(|l| l.rel.contains(&LinkRel::Ugc)).count(),
252        with_title: links.iter().filter(|l| l.title.is_some()).count(),
253        opens_new_tab: links.iter().filter(|l| l.opens_new_tab()).count(),
254    }
255}
256
257// ============================================================================
258// TESTS
259// ============================================================================
260
#[cfg(test)]
mod tests {
    //! Unit tests for link extraction, URL resolution, rel parsing and the
    //! link-analysis helpers.

    use super::*;

    /// Parse an HTML string into a full document for use in the tests below.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // Two anchors with distinct hrefs are both extracted.
    #[test]
    fn test_extract_links_basic() {
        let doc = parse_html(r#"
            <html><body>
                <a href="https://example.com">Example</a>
                <a href="/page">Internal</a>
            </body></html>
        "#);
        let config = ParserConfig::default();
        let links = extract_links(&doc, &config).unwrap();
        assert_eq!(links.len(), 2);
    }

    // href, text, title, rel and target are all carried onto the Link.
    #[test]
    fn test_extract_link_with_attributes() {
        let doc = parse_html(r#"
            <a href="https://example.com" 
               title="Example Site" 
               rel="nofollow external" 
               target="_blank">Link</a>
        "#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        let link = extract_link(&anchor, None).unwrap();
        
        assert_eq!(link.href, "https://example.com");
        assert_eq!(link.text, "Link");
        assert_eq!(link.title, Some("Example Site".to_string()));
        assert!(link.is_nofollow);
        assert!(link.opens_new_tab());
        assert!(link.rel.contains(&LinkRel::NoFollow));
        assert!(link.rel.contains(&LinkRel::External));
    }

    // `javascript:` hrefs are rejected.
    #[test]
    fn test_extract_link_skips_javascript() {
        let doc = parse_html(r#"<a href="javascript:void(0)">Click</a>"#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    // `mailto:` hrefs are rejected.
    #[test]
    fn test_extract_link_skips_mailto() {
        let doc = parse_html(r#"<a href="mailto:test@example.com">Email</a>"#);
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    // A bare `#` href is rejected.
    #[test]
    fn test_extract_link_skips_hash() {
        let doc = parse_html("<a href=\"#\">Top</a>");
        let anchor = doc.select(&SELECTORS.a).next().unwrap();
        assert!(extract_link(&anchor, None).is_none());
    }

    // An absolute https URL resolves without needing a base URL.
    #[test]
    fn test_resolve_url_absolute() {
        assert_eq!(
            resolve_url("https://example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    // A protocol-relative URL is given the https scheme.
    #[test]
    fn test_resolve_url_protocol_relative() {
        assert_eq!(
            resolve_url("//example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    // A document-relative href is joined onto the base directory.
    #[test]
    fn test_resolve_url_relative() {
        let base = Url::parse("https://example.com/dir/").unwrap();
        assert_eq!(
            resolve_url("page.html", Some(&base)),
            Some("https://example.com/dir/page.html".to_string())
        );
    }

    // A root-relative href replaces the whole path of the base URL.
    #[test]
    fn test_resolve_url_root_relative() {
        let base = Url::parse("https://example.com/dir/page").unwrap();
        assert_eq!(
            resolve_url("/other", Some(&base)),
            Some("https://example.com/other".to_string())
        );
    }

    // Normalization strips the fragment.
    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            Some("https://example.com/page".to_string())
        );
    }

    // Same host as the base counts as internal.
    #[test]
    fn test_determine_link_type_internal() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://example.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::Internal);
    }

    // A subdomain of the base host counts as internal.
    #[test]
    fn test_determine_link_type_subdomain() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://blog.example.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::Internal);
    }

    // An unrelated host counts as external.
    #[test]
    fn test_determine_link_type_external() {
        let base = Url::parse("https://example.com").unwrap();
        let url = Some("https://other.com/page".to_string());
        assert_eq!(determine_link_type(&url, Some(&base)), LinkType::External);
    }

    // Each whitespace-separated rel token is mapped to its LinkRel value.
    #[test]
    fn test_parse_rel_attribute() {
        let rels = parse_rel_attribute("nofollow ugc sponsored");
        assert!(rels.contains(&LinkRel::NoFollow));
        assert!(rels.contains(&LinkRel::Ugc));
        assert!(rels.contains(&LinkRel::Sponsored));
    }

    // `nofollow` is detected regardless of its position among the tokens.
    #[test]
    fn test_is_nofollow() {
        assert!(is_nofollow("nofollow"));
        assert!(is_nofollow("nofollow external"));
        assert!(is_nofollow("external nofollow"));
        assert!(!is_nofollow("external"));
    }

    // Only links marked Internal are returned.
    #[test]
    fn test_filter_internal_links() {
        let links = vec![
            Link { link_type: LinkType::Internal, ..Link::new("/a", "A") },
            Link { link_type: LinkType::External, ..Link::new("https://ext.com", "B") },
            Link { link_type: LinkType::Internal, ..Link::new("/b", "C") },
        ];
        let internal = filter_internal_links(&links);
        assert_eq!(internal.len(), 2);
    }

    // A link flagged nofollow is excluded from the followable set.
    #[test]
    fn test_filter_followable_links() {
        let mut nofollow = Link::new("/page", "Page");
        nofollow.is_nofollow = true;
        
        let links = vec![
            Link::new("/a", "A"),
            nofollow,
            Link::new("/b", "B"),
        ];
        let followable = filter_followable_links(&links);
        assert_eq!(followable.len(), 2);
    }

    // Three external links across two hosts yield two distinct domains.
    #[test]
    fn test_get_external_domains() {
        let links = vec![
            Link { 
                link_type: LinkType::External, 
                url: Some("https://example.com/page".to_string()),
                ..Link::new("https://example.com/page", "A") 
            },
            Link { 
                link_type: LinkType::External, 
                url: Some("https://other.com/page".to_string()),
                ..Link::new("https://other.com/page", "B") 
            },
            Link { 
                link_type: LinkType::External, 
                url: Some("https://example.com/other".to_string()),
                ..Link::new("https://example.com/other", "C") 
            },
        ];
        let domains = get_external_domains(&links);
        assert_eq!(domains.len(), 2);
        assert!(domains.contains("example.com"));
        assert!(domains.contains("other.com"));
    }

    // Counters are independent: the nofollow link is also counted as internal.
    #[test]
    fn test_calculate_link_stats() {
        let mut nofollow = Link::new("/page", "Page");
        nofollow.is_nofollow = true;
        nofollow.link_type = LinkType::Internal;
        
        let mut sponsored = Link::new("https://ad.com", "Ad");
        sponsored.rel = vec![LinkRel::Sponsored];
        sponsored.link_type = LinkType::External;
        
        let links = vec![
            Link { link_type: LinkType::Internal, ..Link::new("/a", "A") },
            nofollow,
            sponsored,
        ];
        
        let stats = calculate_link_stats(&links);
        assert_eq!(stats.total, 3);
        assert_eq!(stats.internal, 2);
        assert_eq!(stats.external, 1);
        assert_eq!(stats.nofollow, 1);
        assert_eq!(stats.sponsored, 1);
    }

    // Two anchors sharing an href collapse into a single link.
    #[test]
    fn test_deduplicate_links() {
        let doc = parse_html(r#"
            <html><body>
                <a href="https://example.com">First</a>
                <a href="https://example.com">Duplicate</a>
                <a href="https://other.com">Other</a>
            </body></html>
        "#);
        let config = ParserConfig::default();
        let links = extract_links(&doc, &config).unwrap();
        assert_eq!(links.len(), 2);
    }
}
489}