Skip to main content

meshlet_core/
fetch.rs

1use std::time::Duration;
2
3use reqwest::blocking::Client;
4use scraper::{Html, Selector};
5
/// Outcome of fetching a bookmark URL, including any metadata scraped from
/// the page.
#[derive(Clone, Debug)]
pub struct FetchResult {
    /// URL reported by the last HTTP response, or the input URL unchanged
    /// when no response was obtained.
    pub final_url: String,
    /// Whitespace-normalised contents of the page's `<title>`, if non-empty.
    pub title: Option<String>,
    /// Trimmed `content` of the description meta tag, if non-empty.
    pub desc: Option<String>,
    /// Non-empty, trimmed entries of the comma-separated keywords meta tag.
    pub tags: Vec<String>,
    /// HTTP status code of the last response; `0` when there was none.
    pub status: u16,
    /// Set when a non-HTML response declared an `application/`, `image/`,
    /// `audio/` or `video/` content type.
    pub is_mime: bool,
    /// Set when the URL was rejected, a request failed, or the server
    /// answered with a status of 400 or above.
    pub bad: bool,
}
16
17pub fn fetch_bookmark_data(url: &str) -> FetchResult {
18    let parsed = match url::Url::parse(url) {
19        Ok(u) => u,
20        Err(_) => {
21            return FetchResult {
22                final_url: url.to_string(),
23                title: None,
24                desc: None,
25                tags: vec![],
26                status: 0,
27                is_mime: false,
28                bad: true,
29            };
30        }
31    };
32
33    if parsed.scheme() != "http" && parsed.scheme() != "https" {
34        return FetchResult {
35            final_url: url.to_string(),
36            title: None,
37            desc: None,
38            tags: vec![],
39            status: 0,
40            is_mime: false,
41            bad: true,
42        };
43    }
44
45    let client = match Client::builder()
46        .timeout(Duration::from_secs(30))
47        .user_agent(
48            "Mozilla/5.0 (compatible; Meshlet/0.1; +https://github.com/meshlet)",
49        )
50        .redirect(reqwest::redirect::Policy::limited(5))
51        .danger_accept_invalid_certs(false)
52        .build()
53    {
54        Ok(c) => c,
55        Err(_) => {
56            return FetchResult {
57                final_url: url.to_string(),
58                title: None,
59                desc: None,
60                tags: vec![],
61                status: 0,
62                is_mime: false,
63                bad: true,
64            };
65        }
66    };
67
68    match client.head(url).send() {
69        Ok(resp) => {
70            let status = resp.status().as_u16();
71            let final_url = resp.url().to_string();
72            let content_type = resp
73                .headers()
74                .get(reqwest::header::CONTENT_TYPE)
75                .and_then(|v| v.to_str().ok())
76                .unwrap_or("");
77
78            let is_html = content_type.contains("text/html")
79                || content_type.contains("application/xhtml+xml");
80
81            if !is_html {
82                return FetchResult {
83                    final_url,
84                    title: None,
85                    desc: None,
86                    tags: vec![],
87                    status,
88                    is_mime: content_type.contains("application/")
89                        || content_type.contains("image/")
90                        || content_type.contains("audio/")
91                        || content_type.contains("video/"),
92                    bad: status >= 400,
93                };
94            }
95
96            if status >= 400 {
97                return FetchResult {
98                    final_url,
99                    title: None,
100                    desc: None,
101                    tags: vec![],
102                    status,
103                    is_mime: false,
104                    bad: true,
105                };
106            }
107
108            match client.get(url).send() {
109                Ok(get_resp) => {
110                    let final_url = get_resp.url().to_string();
111                    let status = get_resp.status().as_u16();
112
113                    if status >= 400 {
114                        return FetchResult {
115                            final_url,
116                            title: None,
117                            desc: None,
118                            tags: vec![],
119                            status,
120                            is_mime: false,
121                            bad: true,
122                        };
123                    }
124
125                    let body = match get_resp.text() {
126                        Ok(t) => t,
127                        Err(_) => {
128                            return FetchResult {
129                                final_url,
130                                title: None,
131                                desc: None,
132                                tags: vec![],
133                                status,
134                                is_mime: false,
135                                bad: true,
136                            };
137                        }
138                    };
139
140                    let document = Html::parse_document(&body);
141                    let title = extract_title(&document);
142                    let desc = extract_meta(&document, "description");
143                    let tags = extract_keywords(&document);
144
145                    FetchResult {
146                        final_url,
147                        title,
148                        desc,
149                        tags,
150                        status,
151                        is_mime: false,
152                        bad: false,
153                    }
154                }
155                Err(_) => FetchResult {
156                    final_url: url.to_string(),
157                    title: None,
158                    desc: None,
159                    tags: vec![],
160                    status: 0,
161                    is_mime: false,
162                    bad: true,
163                },
164            }
165        }
166        Err(_) => FetchResult {
167            final_url: url.to_string(),
168            title: None,
169            desc: None,
170            tags: vec![],
171            status: 0,
172            is_mime: false,
173            bad: true,
174        },
175    }
176}
177
178fn extract_title(document: &Html) -> Option<String> {
179    let selector = Selector::parse("title").ok()?;
180    document
181        .select(&selector)
182        .next()
183        .map(|el| el.text().collect::<Vec<_>>().join(""))
184        .map(|s| s.split_whitespace().collect::<Vec<_>>().join(" "))
185        .filter(|s| !s.is_empty())
186}
187
188fn extract_meta(document: &Html, name: &str) -> Option<String> {
189    let selector =
190        Selector::parse(&format!("meta[name=\"{}\"]", name)).ok()?;
191    document
192        .select(&selector)
193        .next()
194        .and_then(|el| el.value().attr("content"))
195        .map(|s| s.trim().to_string())
196        .filter(|s| !s.is_empty())
197}
198
199fn extract_keywords(document: &Html) -> Vec<String> {
200    let selector = Selector::parse("meta[name=\"keywords\"]").ok();
201    let content = selector.and_then(|sel| {
202        document
203            .select(&sel)
204            .next()
205            .and_then(|el| el.value().attr("content"))
206            .map(|s| s.to_string())
207    });
208
209    match content {
210        Some(s) => s
211            .split(',')
212            .map(|kw| kw.trim().to_string())
213            .filter(|kw| !kw.is_empty())
214            .collect(),
215        None => vec![],
216    }
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// Page with a title, a description and three keywords.
    const HTML_SIMPLE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>Simple Test Page</title>
    <meta name="description" content="A simple test page for testing">
    <meta name="keywords" content="test, simple, rust">
</head>
<body><p>Hello world</p></body>
</html>"#;

    /// Page with a title but no meta tags at all.
    const HTML_NO_DESC: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>No Description Page</title>
</head>
<body><p>No meta tags here</p></body>
</html>"#;

    /// Page whose `<title>` element is present but empty.
    const HTML_EMPTY: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title></title>
</head>
<body></body>
</html>"#;

    /// Page with non-ASCII characters in title, description and keywords.
    const HTML_UNICODE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>Café & Crème — Spécial</title>
    <meta name="description" content="Testing unicode café characters">
    <meta name="keywords" content="café, crème, ünicode">
</head>
<body></body>
</html>"#;

    /// Page with a multi-line title, a blank description and keywords that
    /// contain empty entries.
    const HTML_TITLE_WITH_WHITESPACE: &str = r#"<!DOCTYPE html>
<html>
<head>
    <title>
        Multi-line
        Title
    </title>
    <meta name="description" content="   ">
    <meta name="keywords" content="tag1, , tag2, ">
</head>
<body></body>
</html>"#;

    /// Parses a fixture string into a document.
    fn parse(markup: &str) -> Html {
        Html::parse_document(markup)
    }

    #[test]
    fn test_extract_simple_title() {
        let page = parse(HTML_SIMPLE);
        assert_eq!(extract_title(&page).as_deref(), Some("Simple Test Page"));
    }

    #[test]
    fn test_extract_description() {
        let page = parse(HTML_SIMPLE);
        let description = extract_meta(&page, "description");
        assert_eq!(description.as_deref(), Some("A simple test page for testing"));
    }

    #[test]
    fn test_extract_keywords() {
        let page = parse(HTML_SIMPLE);
        assert_eq!(extract_keywords(&page), ["test", "simple", "rust"]);
    }

    #[test]
    fn test_no_description() {
        let page = parse(HTML_NO_DESC);
        assert_eq!(extract_title(&page).as_deref(), Some("No Description Page"));
        assert!(extract_meta(&page, "description").is_none());
        assert_eq!(extract_keywords(&page).len(), 0);
    }

    #[test]
    fn test_empty_title() {
        let page = parse(HTML_EMPTY);
        assert!(extract_title(&page).is_none());
    }

    #[test]
    fn test_unicode_handling() {
        let page = parse(HTML_UNICODE);
        assert_eq!(extract_title(&page).as_deref(), Some("Café & Crème — Spécial"));
        assert_eq!(
            extract_meta(&page, "description").as_deref(),
            Some("Testing unicode café characters")
        );
        assert_eq!(extract_keywords(&page), ["café", "crème", "ünicode"]);
    }

    #[test]
    fn test_whitespace_handling() {
        let page = parse(HTML_TITLE_WITH_WHITESPACE);
        assert_eq!(extract_title(&page).as_deref(), Some("Multi-line Title"));
        assert!(extract_meta(&page, "description").is_none());
        assert_eq!(extract_keywords(&page), ["tag1", "tag2"]);
    }

    #[test]
    fn test_bad_url() {
        let outcome = fetch_bookmark_data("not-a-valid-url!!!");
        assert!(outcome.bad);
    }

    #[test]
    fn test_non_http_url() {
        let outcome = fetch_bookmark_data("ftp://example.com/file");
        assert!(outcome.bad);
    }
}