1use argus_common::ExtractedLink;
2use scraper::{Html, Selector};
3use url::Url;
4
/// Metadata pulled out of the `<head>`-style elements of an HTML page.
///
/// Every field holds the value exactly as it appears in the document; URLs
/// are not resolved against any base address.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// `href` of the first `<link rel="canonical">` element, if one exists.
    pub canonical_url: Option<String>,
    /// `href` of every `<link rel="alternate">` element that also carries an
    /// `hreflang` attribute, in document order.
    pub alternate_urls: Vec<String>,
    /// Whitespace-trimmed text content of the first `<title>` element.
    pub title: Option<String>,
    /// `content` of the first `<meta name="description">` element.
    pub description: Option<String>,
}
12
13pub fn extract_links(base_url: &str, body: &[u8]) -> Vec<ExtractedLink> {
14 let html = match std::str::from_utf8(body) {
15 Ok(s) => s,
16 Err(_) => return vec![],
17 };
18
19 let base = match Url::parse(base_url) {
20 Ok(u) => u,
21 Err(_) => return vec![],
22 };
23
24 let document = Html::parse_document(html);
25 let mut links = Vec::new();
26
27 if let Ok(selector) = Selector::parse("a[href]") {
28 for el in document.select(&selector) {
29 if let Some(href) = el.value().attr("href") {
30 if let Ok(url) = base.join(href) {
31 links.push(ExtractedLink {
32 from_url: base_url.to_string(),
33 to_url: url.to_string(),
34 });
35 }
36 }
37 }
38 }
39
40 if let Ok(selector) = Selector::parse("link[rel='alternate'][href]") {
41 for el in document.select(&selector) {
42 if let Some(href) = el.value().attr("href") {
43 if let Ok(url) = base.join(href) {
44 links.push(ExtractedLink {
45 from_url: base_url.to_string(),
46 to_url: url.to_string(),
47 });
48 }
49 }
50 }
51 }
52
53 links
54}
55
56pub fn extract_metadata(body: &[u8]) -> PageMetadata {
57 let html = match std::str::from_utf8(body) {
58 Ok(s) => s,
59 Err(_) => return PageMetadata::default(),
60 };
61
62 let document = Html::parse_document(html);
63 let mut metadata = PageMetadata::default();
64
65 if let Ok(selector) = Selector::parse("link[rel='canonical'][href]") {
66 if let Some(el) = document.select(&selector).next() {
67 metadata.canonical_url = el.value().attr("href").map(|s| s.to_string());
68 }
69 }
70
71 if let Ok(selector) = Selector::parse("link[rel='alternate'][hreflang][href]") {
72 for el in document.select(&selector) {
73 if let Some(href) = el.value().attr("href") {
74 metadata.alternate_urls.push(href.to_string());
75 }
76 }
77 }
78
79 if let Ok(selector) = Selector::parse("title") {
80 if let Some(el) = document.select(&selector).next() {
81 metadata.title = Some(el.text().collect::<String>().trim().to_string());
82 }
83 }
84
85 if let Ok(selector) = Selector::parse("meta[name='description'][content]") {
86 if let Some(el) = document.select(&selector).next() {
87 metadata.description = el.value().attr("content").map(|s| s.to_string());
88 }
89 }
90
91 metadata
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Base URL used by the tests that resolve site-relative links.
    const SITE_ROOT: &str = "https://example.com/";

    /// True when at least one extracted link points at `target`.
    fn has_target(links: &[ExtractedLink], target: &str) -> bool {
        links.iter().any(|link| link.to_url == target)
    }

    #[test]
    fn extracts_absolute_link() {
        let page = "https://example.com/page";
        let links = extract_links(page, br#"<a href="https://example.com/other">link</a>"#);

        assert_eq!(links.len(), 1);
        let only = &links[0];
        assert_eq!(only.from_url, "https://example.com/page");
        assert_eq!(only.to_url, "https://example.com/other");
    }

    #[test]
    fn resolves_relative_link() {
        let links = extract_links(SITE_ROOT, br#"<a href="/about">about</a>"#);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].to_url, "https://example.com/about");
    }

    #[test]
    fn returns_empty_for_invalid_utf8() {
        // 0xFF 0xFE is not a valid UTF-8 sequence.
        assert!(extract_links(SITE_ROOT, b"\xff\xfe").is_empty());
    }

    #[test]
    fn returns_empty_for_no_links() {
        assert!(extract_links(SITE_ROOT, b"<p>no links here</p>").is_empty());
    }

    #[test]
    fn extracts_canonical_url() {
        let metadata =
            extract_metadata(br#"<link rel="canonical" href="https://example.com/canonical">"#);

        assert_eq!(
            metadata.canonical_url.as_deref(),
            Some("https://example.com/canonical")
        );
    }

    #[test]
    fn extracts_alternate_urls() {
        let html = concat!(
            r#"<link rel="alternate" hreflang="es" href="https://example.com/es">"#,
            r#"<link rel="alternate" hreflang="fr" href="https://example.com/fr">"#,
        );
        let metadata = extract_metadata(html.as_bytes());

        assert_eq!(metadata.alternate_urls.len(), 2);
        for expected in &["https://example.com/es", "https://example.com/fr"] {
            assert!(metadata
                .alternate_urls
                .iter()
                .any(|url| url.as_str() == *expected));
        }
    }

    #[test]
    fn extracts_title_and_description() {
        let html = concat!(
            r#"<title>Page Title</title>"#,
            r#"<meta name="description" content="Page description">"#,
        );
        let metadata = extract_metadata(html.as_bytes());

        assert_eq!(metadata.title.as_deref(), Some("Page Title"));
        assert_eq!(metadata.description.as_deref(), Some("Page description"));
    }

    #[test]
    fn extracts_alternate_links() {
        let html = concat!(
            r#"<a href="/page1">Link 1</a>"#,
            r#"<link rel="alternate" href="/page2">"#,
        );
        let links = extract_links(SITE_ROOT, html.as_bytes());

        assert_eq!(links.len(), 2);
        assert!(has_target(&links, "https://example.com/page1"));
        assert!(has_target(&links, "https://example.com/page2"));
    }
}