1use std::time::Duration;
2
3use reqwest::blocking::Client;
4use scraper::{Html, Selector};
5
/// Outcome of fetching a bookmarked URL and scraping its page metadata.
///
/// Failures are reported through the `bad` and `status` fields rather than
/// through a `Result`, so a value of this type is produced for every input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FetchResult {
    /// URL the response was served from (after redirects), or the input URL
    /// unchanged when no response was obtained.
    pub final_url: String,
    /// Text of the page's `<title>` with whitespace runs collapsed, if present
    /// and non-empty.
    pub title: Option<String>,
    /// Trimmed content of the page's `description` meta tag, if present and
    /// non-empty.
    pub desc: Option<String>,
    /// Comma-separated entries of the page's `keywords` meta tag, trimmed and
    /// with empty entries removed.
    pub tags: Vec<String>,
    /// HTTP status code of the response, or `0` when no response was received.
    pub status: u16,
    /// `true` when the response was not HTML and its `Content-Type` named an
    /// `application/`, `image/`, `audio/` or `video/` media type.
    pub is_mime: bool,
    /// `true` when the URL was rejected, the request failed, or the server
    /// answered with a status of 400 or above.
    pub bad: bool,
}
16
17pub fn fetch_bookmark_data(url: &str) -> FetchResult {
18 let parsed = match url::Url::parse(url) {
19 Ok(u) => u,
20 Err(_) => {
21 return FetchResult {
22 final_url: url.to_string(),
23 title: None,
24 desc: None,
25 tags: vec![],
26 status: 0,
27 is_mime: false,
28 bad: true,
29 };
30 }
31 };
32
33 if parsed.scheme() != "http" && parsed.scheme() != "https" {
34 return FetchResult {
35 final_url: url.to_string(),
36 title: None,
37 desc: None,
38 tags: vec![],
39 status: 0,
40 is_mime: false,
41 bad: true,
42 };
43 }
44
45 let client = match Client::builder()
46 .timeout(Duration::from_secs(30))
47 .user_agent(
48 "Mozilla/5.0 (compatible; Meshlet/0.1; +https://github.com/meshlet)",
49 )
50 .redirect(reqwest::redirect::Policy::limited(5))
51 .danger_accept_invalid_certs(false)
52 .build()
53 {
54 Ok(c) => c,
55 Err(_) => {
56 return FetchResult {
57 final_url: url.to_string(),
58 title: None,
59 desc: None,
60 tags: vec![],
61 status: 0,
62 is_mime: false,
63 bad: true,
64 };
65 }
66 };
67
68 match client.head(url).send() {
69 Ok(resp) => {
70 let status = resp.status().as_u16();
71 let final_url = resp.url().to_string();
72 let content_type = resp
73 .headers()
74 .get(reqwest::header::CONTENT_TYPE)
75 .and_then(|v| v.to_str().ok())
76 .unwrap_or("");
77
78 let is_html = content_type.contains("text/html")
79 || content_type.contains("application/xhtml+xml");
80
81 if !is_html {
82 return FetchResult {
83 final_url,
84 title: None,
85 desc: None,
86 tags: vec![],
87 status,
88 is_mime: content_type.contains("application/")
89 || content_type.contains("image/")
90 || content_type.contains("audio/")
91 || content_type.contains("video/"),
92 bad: status >= 400,
93 };
94 }
95
96 if status >= 400 {
97 return FetchResult {
98 final_url,
99 title: None,
100 desc: None,
101 tags: vec![],
102 status,
103 is_mime: false,
104 bad: true,
105 };
106 }
107
108 match client.get(url).send() {
109 Ok(get_resp) => {
110 let final_url = get_resp.url().to_string();
111 let status = get_resp.status().as_u16();
112
113 if status >= 400 {
114 return FetchResult {
115 final_url,
116 title: None,
117 desc: None,
118 tags: vec![],
119 status,
120 is_mime: false,
121 bad: true,
122 };
123 }
124
125 let body = match get_resp.text() {
126 Ok(t) => t,
127 Err(_) => {
128 return FetchResult {
129 final_url,
130 title: None,
131 desc: None,
132 tags: vec![],
133 status,
134 is_mime: false,
135 bad: true,
136 };
137 }
138 };
139
140 let document = Html::parse_document(&body);
141 let title = extract_title(&document);
142 let desc = extract_meta(&document, "description");
143 let tags = extract_keywords(&document);
144
145 FetchResult {
146 final_url,
147 title,
148 desc,
149 tags,
150 status,
151 is_mime: false,
152 bad: false,
153 }
154 }
155 Err(_) => FetchResult {
156 final_url: url.to_string(),
157 title: None,
158 desc: None,
159 tags: vec![],
160 status: 0,
161 is_mime: false,
162 bad: true,
163 },
164 }
165 }
166 Err(_) => FetchResult {
167 final_url: url.to_string(),
168 title: None,
169 desc: None,
170 tags: vec![],
171 status: 0,
172 is_mime: false,
173 bad: true,
174 },
175 }
176}
177
178fn extract_title(document: &Html) -> Option<String> {
179 let selector = Selector::parse("title").ok()?;
180 document
181 .select(&selector)
182 .next()
183 .map(|el| el.text().collect::<Vec<_>>().join(""))
184 .map(|s| s.split_whitespace().collect::<Vec<_>>().join(" "))
185 .filter(|s| !s.is_empty())
186}
187
188fn extract_meta(document: &Html, name: &str) -> Option<String> {
189 let selector =
190 Selector::parse(&format!("meta[name=\"{}\"]", name)).ok()?;
191 document
192 .select(&selector)
193 .next()
194 .and_then(|el| el.value().attr("content"))
195 .map(|s| s.trim().to_string())
196 .filter(|s| !s.is_empty())
197}
198
199fn extract_keywords(document: &Html) -> Vec<String> {
200 let selector = Selector::parse("meta[name=\"keywords\"]").ok();
201 let content = selector.and_then(|sel| {
202 document
203 .select(&sel)
204 .next()
205 .and_then(|el| el.value().attr("content"))
206 .map(|s| s.to_string())
207 });
208
209 match content {
210 Some(s) => s
211 .split(',')
212 .map(|kw| kw.trim().to_string())
213 .filter(|kw| !kw.is_empty())
214 .collect(),
215 None => vec![],
216 }
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    // Page with a title, a description and three keywords.
    const HTML_SIMPLE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>Simple Test Page</title>
    <meta name="description" content="A simple test page for testing">
    <meta name="keywords" content="test, simple, rust">
</head>
<body><p>Hello world</p></body>
</html>"#;

    // Page with a title but no meta tags at all.
    const HTML_NO_DESC: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>No Description Page</title>
</head>
<body><p>No meta tags here</p></body>
</html>"#;

    // Page whose title element is present but empty.
    const HTML_EMPTY: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title></title>
</head>
<body></body>
</html>"#;

    // Page with non-ASCII text in the title, description and keywords.
    const HTML_UNICODE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>Café & Crème — Spécial</title>
    <meta name="description" content="Testing unicode café characters">
    <meta name="keywords" content="café, crème, ünicode">
</head>
<body></body>
</html>"#;

    // Page with a multi-line title, a blank description and empty keyword entries.
    const HTML_TITLE_WITH_WHITESPACE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>
        Multi-line
        Title
    </title>
    <meta name="description" content=" ">
    <meta name="keywords" content="tag1, , tag2, ">
</head>
<body></body>
</html>"#;

    /// Parses a fixture into a document for the extractors.
    fn parse(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_simple_title() {
        let page = parse(HTML_SIMPLE);
        assert_eq!(extract_title(&page).as_deref(), Some("Simple Test Page"));
    }

    #[test]
    fn test_extract_description() {
        let page = parse(HTML_SIMPLE);
        let description = extract_meta(&page, "description");
        assert_eq!(description.as_deref(), Some("A simple test page for testing"));
    }

    #[test]
    fn test_extract_keywords() {
        let page = parse(HTML_SIMPLE);
        assert_eq!(extract_keywords(&page), vec!["test", "simple", "rust"]);
    }

    #[test]
    fn test_no_description() {
        let page = parse(HTML_NO_DESC);
        assert_eq!(extract_title(&page).as_deref(), Some("No Description Page"));
        assert!(extract_meta(&page, "description").is_none());
        assert!(extract_keywords(&page).is_empty());
    }

    #[test]
    fn test_empty_title() {
        let page = parse(HTML_EMPTY);
        assert!(extract_title(&page).is_none());
    }

    #[test]
    fn test_unicode_handling() {
        let page = parse(HTML_UNICODE);
        assert_eq!(extract_title(&page).as_deref(), Some("Café & Crème — Spécial"));
        assert_eq!(
            extract_meta(&page, "description").as_deref(),
            Some("Testing unicode café characters")
        );
        assert_eq!(extract_keywords(&page), vec!["café", "crème", "ünicode"]);
    }

    #[test]
    fn test_whitespace_handling() {
        let page = parse(HTML_TITLE_WITH_WHITESPACE);
        assert_eq!(extract_title(&page).as_deref(), Some("Multi-line Title"));
        assert!(extract_meta(&page, "description").is_none());
        assert_eq!(extract_keywords(&page), vec!["tag1", "tag2"]);
    }

    #[test]
    fn test_bad_url() {
        assert!(fetch_bookmark_data("not-a-valid-url!!!").bad);
    }

    #[test]
    fn test_non_http_url() {
        assert!(fetch_bookmark_data("ftp://example.com/file").bad);
    }
}