argus_parser/
html.rs

1use argus_common::ExtractedLink;
2use scraper::{Html, Selector};
3use url::Url;
4
/// Document-level metadata collected from an HTML page by [`extract_metadata`].
///
/// Every field is optional or may be empty: a page that lacks the corresponding
/// element simply leaves the field at its `Default` value.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// `href` of the first `<link rel="canonical">` element, exactly as written in the page.
    pub canonical_url: Option<String>,
    /// `href` of every `<link rel="alternate" hreflang=...>` element, in document order.
    pub alternate_urls: Vec<String>,
    /// Text content of the first `<title>` element, with surrounding whitespace trimmed.
    pub title: Option<String>,
    /// `content` of the first `<meta name="description">` element.
    pub description: Option<String>,
}
12
13pub fn extract_links(base_url: &str, body: &[u8]) -> Vec<ExtractedLink> {
14    let html = match std::str::from_utf8(body) {
15        Ok(s) => s,
16        Err(_) => return vec![],
17    };
18
19    let base = match Url::parse(base_url) {
20        Ok(u) => u,
21        Err(_) => return vec![],
22    };
23
24    let document = Html::parse_document(html);
25    let mut links = Vec::new();
26
27    if let Ok(selector) = Selector::parse("a[href]") {
28        for el in document.select(&selector) {
29            if let Some(href) = el.value().attr("href") {
30                if let Ok(url) = base.join(href) {
31                    links.push(ExtractedLink {
32                        from_url: base_url.to_string(),
33                        to_url: url.to_string(),
34                    });
35                }
36            }
37        }
38    }
39
40    if let Ok(selector) = Selector::parse("link[rel='alternate'][href]") {
41        for el in document.select(&selector) {
42            if let Some(href) = el.value().attr("href") {
43                if let Ok(url) = base.join(href) {
44                    links.push(ExtractedLink {
45                        from_url: base_url.to_string(),
46                        to_url: url.to_string(),
47                    });
48                }
49            }
50        }
51    }
52
53    links
54}
55
56pub fn extract_metadata(body: &[u8]) -> PageMetadata {
57    let html = match std::str::from_utf8(body) {
58        Ok(s) => s,
59        Err(_) => return PageMetadata::default(),
60    };
61
62    let document = Html::parse_document(html);
63    let mut metadata = PageMetadata::default();
64
65    if let Ok(selector) = Selector::parse("link[rel='canonical'][href]") {
66        if let Some(el) = document.select(&selector).next() {
67            metadata.canonical_url = el.value().attr("href").map(|s| s.to_string());
68        }
69    }
70
71    if let Ok(selector) = Selector::parse("link[rel='alternate'][hreflang][href]") {
72        for el in document.select(&selector) {
73            if let Some(href) = el.value().attr("href") {
74                metadata.alternate_urls.push(href.to_string());
75            }
76        }
77    }
78
79    if let Ok(selector) = Selector::parse("title") {
80        if let Some(el) = document.select(&selector).next() {
81            metadata.title = Some(el.text().collect::<String>().trim().to_string());
82        }
83    }
84
85    if let Ok(selector) = Selector::parse("meta[name='description'][content]") {
86        if let Some(el) = document.select(&selector).next() {
87            metadata.description = el.value().attr("content").map(|s| s.to_string());
88        }
89    }
90
91    metadata
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Base URL shared by the tests that resolve links against the site root.
    const SITE_ROOT: &str = "https://example.com/";

    /// Target URLs of `links`, in extraction order.
    fn targets(links: &[ExtractedLink]) -> Vec<&str> {
        links.iter().map(|link| link.to_url.as_str()).collect()
    }

    // --- extract_links ---------------------------------------------------

    #[test]
    fn extracts_absolute_link() {
        let page = "https://example.com/page";
        let found = extract_links(page, br#"<a href="https://example.com/other">link</a>"#);

        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.from_url, page);
        assert_eq!(only.to_url, "https://example.com/other");
    }

    #[test]
    fn resolves_relative_link() {
        let found = extract_links(SITE_ROOT, br#"<a href="/about">about</a>"#);

        assert_eq!(targets(&found), ["https://example.com/about"]);
    }

    #[test]
    fn returns_empty_for_invalid_utf8() {
        // 0xFF can never appear in well-formed UTF-8.
        let found = extract_links(SITE_ROOT, b"\xff\xfe");

        assert_eq!(found.len(), 0);
    }

    #[test]
    fn returns_empty_for_no_links() {
        let found = extract_links(SITE_ROOT, b"<p>no links here</p>");

        assert_eq!(found.len(), 0);
    }

    #[test]
    fn extracts_alternate_links() {
        let html = b"<a href=\"/page1\">Link 1</a>\
                     <link rel=\"alternate\" href=\"/page2\">";
        let found = extract_links(SITE_ROOT, html);
        let urls = targets(&found);

        assert_eq!(urls.len(), 2);
        assert!(urls.contains(&"https://example.com/page1"));
        assert!(urls.contains(&"https://example.com/page2"));
    }

    // --- extract_metadata ------------------------------------------------

    #[test]
    fn extracts_canonical_url() {
        let parsed =
            extract_metadata(br#"<link rel="canonical" href="https://example.com/canonical">"#);

        assert_eq!(
            parsed.canonical_url.as_deref(),
            Some("https://example.com/canonical")
        );
    }

    #[test]
    fn extracts_alternate_urls() {
        let html = b"<link rel=\"alternate\" hreflang=\"es\" href=\"https://example.com/es\">\
                     <link rel=\"alternate\" hreflang=\"fr\" href=\"https://example.com/fr\">";
        let parsed = extract_metadata(html);
        let has = |wanted: &str| parsed.alternate_urls.iter().any(|url| url == wanted);

        assert_eq!(parsed.alternate_urls.len(), 2);
        assert!(has("https://example.com/es"));
        assert!(has("https://example.com/fr"));
    }

    #[test]
    fn extracts_title_and_description() {
        let html = b"<title>Page Title</title>\
                     <meta name=\"description\" content=\"Page description\">";
        let parsed = extract_metadata(html);

        assert_eq!(parsed.title.as_deref(), Some("Page Title"));
        assert_eq!(parsed.description.as_deref(), Some("Page description"));
    }
}
argus_parser/html.rs

argus_parser/
html.rs