html_meta_scraper/
lib.rs

1pub struct MetaScraper {
2    document: scraper::Html,
3}
4
5impl MetaScraper {
6    pub fn new(html: &str) -> Self {
7        let document = scraper::Html::parse_document(html);
8
9        MetaScraper { document }
10    }
11
12    /// Expected Output: `"Page Title"`
13    /// ```html
14    /// <title>Page Title</title>
15    /// ```
16    pub fn extract_title(&self) -> Option<String> {
17        self.document
18            .select(&scraper::Selector::parse("title").unwrap())
19            .next()
20            .map(|element| element.text().collect::<String>())
21    }
22
23    /// Expected Output: `"Page Title"`
24    /// ```html
25    /// <meta property="og:title" content="Page Title" />
26    /// ```
27    pub fn extract_og_title(&self) -> Option<String> {
28        let og_title_selector = scraper::Selector::parse("meta[property='og:title']").unwrap();
29
30        let og_title = self
31            .document
32            .select(&og_title_selector)
33            .next()
34            .and_then(|element| {
35                element
36                    .value()
37                    .attr("content")
38                    .map(|content| content.to_string())
39            });
40
41        og_title
42    }
43
44    /// Expected Output: `"Page Title"`
45    /// ```html
46    /// <meta name="twitter:title" content="Page Title" />
47    /// ```
48    pub fn extract_twitter_title(&self) -> Option<String> {
49        let twitter_title_selector =
50            scraper::Selector::parse("meta[name='twitter:title']").unwrap();
51
52        let twitter_title = self
53            .document
54            .select(&twitter_title_selector)
55            .next()
56            .and_then(|element| {
57                element
58                    .value()
59                    .attr("content")
60                    .map(|content| content.to_string())
61            });
62
63        twitter_title
64    }
65
66    /// Retrieves the page title.
67    ///
68    /// Priority order:
69    /// 1. `<meta property="og:title">`
70    /// 2. `<meta name="twitter:title">`
71    /// 3. `<title>`
72    ///
73    /// Returns the first one found.
74    pub fn title(&self) -> Option<String> {
75        self.extract_og_title()
76            .or_else(|| self.extract_twitter_title())
77            .or_else(|| self.extract_title())
78    }
79
80    /// Expected Output: `"My Description"`
81    /// ```html
82    /// <meta name="description" content="My Description" />
83    /// ```
84    pub fn extract_description(&self) -> Option<String> {
85        let description_selector = scraper::Selector::parse("meta[name='description']").unwrap();
86
87        let description = self
88            .document
89            .select(&description_selector)
90            .next()
91            .and_then(|element| {
92                element
93                    .value()
94                    .attr("content")
95                    .map(|content| content.to_string())
96            });
97
98        description
99    }
100
101    /// Expected Output: `"My Description"`
102    /// ```html
103    /// <meta property="og:description" content="My Description" />
104    /// ```
105    pub fn extract_og_description(&self) -> Option<String> {
106        let og_description_selector =
107            scraper::Selector::parse("meta[property='og:description']").unwrap();
108
109        let og_description = self
110            .document
111            .select(&og_description_selector)
112            .next()
113            .and_then(|element| {
114                element
115                    .value()
116                    .attr("content")
117                    .map(|content| content.to_string())
118            });
119
120        og_description
121    }
122
123    /// Expected Output: `"My Description"`
124    /// ```html
125    /// <meta name="twitter:description" content="My Description" />
126    /// ```
127    pub fn extract_twitter_description(&self) -> Option<String> {
128        let twitter_description_selector =
129            scraper::Selector::parse("meta[name='twitter:description']").unwrap();
130
131        let twitter_description = self
132            .document
133            .select(&twitter_description_selector)
134            .next()
135            .and_then(|element| {
136                element
137                    .value()
138                    .attr("content")
139                    .map(|content| content.to_string())
140            });
141
142        twitter_description
143    }
144
145    /// Retrieves the page description.
146    ///
147    /// Priority order:
148    /// 1. `<meta property="og:description">`
149    /// 2. `<meta name="twitter:description">`
150    /// 3. `<meta name="description">`
151    ///
152    /// Returns the first one found.
153    pub fn description(&self) -> Option<String> {
154        self.extract_og_description()
155            .or_else(|| self.extract_twitter_description())
156            .or_else(|| self.extract_description())
157    }
158
159    /// Expected Output: `"/favicon.ico"`
160    /// ```html
161    /// <link rel="icon" href="/favicon.ico" />
162    /// ```
163    pub fn favicon(&self) -> Option<String> {
164        let favicon_selector = scraper::Selector::parse("link[rel='icon']").unwrap();
165
166        let favicon = self
167            .document
168            .select(&favicon_selector)
169            .next()
170            .and_then(|element| element.value().attr("href").map(|href| href.to_string()));
171
172        favicon
173    }
174
175    /// Expected Output: `"https://example.com/image.jpg"`
176    /// ```html
177    /// <meta property="og:image" content="https://example.com/image.jpg" />
178    /// ```
179    pub fn extract_og_image(&self) -> Option<String> {
180        let og_image_selector = scraper::Selector::parse("meta[property='og:image']").unwrap();
181
182        let og_image = self
183            .document
184            .select(&og_image_selector)
185            .next()
186            .and_then(|element| {
187                element
188                    .value()
189                    .attr("content")
190                    .map(|content| content.to_string())
191            });
192
193        og_image
194    }
195
196    /// Expected Output: `["https://example.com/image.jpg", "https://example.com/image.png"]`
197    /// ```html
198    /// <meta property="og:image" content="https://example.com/image.jpg" />
199    /// <meta property="og:image" content="https://example.com/image.png" />
200    /// ```
201    pub fn extract_og_images(&self) -> Vec<String> {
202        let og_image_selector = scraper::Selector::parse("meta[property='og:image']").unwrap();
203
204        let og_images = self
205            .document
206            .select(&og_image_selector)
207            .into_iter()
208            .filter_map(|element| {
209                element
210                    .value()
211                    .attr("content")
212                    .map(|content| content.to_string())
213            })
214            .collect::<Vec<String>>();
215
216        og_images
217    }
218
219    /// Expected Output: `"https://example.com/image.jpg"`
220    /// ```html
221    /// <meta name="twitter:image" content="https://example.com/image.jpg" />
222    /// <meta name="twitter:image:alt" content="Image description" />
223    /// ```
224    pub fn extract_twitter_image(&self) -> Option<String> {
225        let twitter_image_selector =
226            scraper::Selector::parse("meta[name='twitter:image']").unwrap();
227
228        let twitter_image = self
229            .document
230            .select(&twitter_image_selector)
231            .next()
232            .and_then(|element| {
233                element
234                    .value()
235                    .attr("content")
236                    .map(|content| content.to_string())
237            });
238
239        twitter_image
240    }
241
242    /// Retrieves the page image URL.
243    ///
244    /// Priority order:
245    /// 1. `<meta property="og:image">`
246    /// 2. `<meta name="twitter:image">`
247    ///
248    /// Returns the first one found.
249    pub fn image(&self) -> Option<String> {
250        self.extract_og_image()
251            .or_else(|| self.extract_twitter_image())
252    }
253
254    /// Expected Output: `"en"`
255    /// ```html
256    /// <html lang="en">
257    /// ...
258    /// </html>
259    /// ```
260    pub fn lang(&self) -> Option<String> {
261        let html_selector = scraper::Selector::parse("html").unwrap();
262
263        let lang = self
264            .document
265            .select(&html_selector)
266            .next()
267            .and_then(|element| {
268                element
269                    .value()
270                    .attr("lang")
271                    .map(|content| content.to_string())
272            });
273
274        lang
275    }
276}
277
#[cfg(test)]
mod test {
    use super::*;

    /// Shorthand for the `Some(String)` values the accessors return.
    fn some(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    #[test]
    fn extract_title() {
        let page = MetaScraper::new(r#"<title>Page Title</title>"#);

        assert_eq!(page.extract_title(), some("Page Title"));
    }

    #[test]
    fn extract_og_title() {
        let page = MetaScraper::new(r#"<meta property="og:title" content="Page Title" />"#);

        assert_eq!(page.extract_og_title(), some("Page Title"));
    }

    #[test]
    fn extract_twitter_title() {
        let page = MetaScraper::new(r#"<meta name="twitter:title" content="Page Title" />"#);

        assert_eq!(page.extract_twitter_title(), some("Page Title"));
    }

    /// `title()` prefers Open Graph, then Twitter, then the `<title>` element.
    #[test]
    fn title_priority() {
        let all_three = MetaScraper::new(
            r#"<title>Plain</title>
            <meta name="twitter:title" content="Twitter" />
            <meta property="og:title" content="Open Graph" />"#,
        );
        assert_eq!(all_three.title(), some("Open Graph"));

        let without_og = MetaScraper::new(
            r#"<title>Plain</title>
            <meta name="twitter:title" content="Twitter" />"#,
        );
        assert_eq!(without_og.title(), some("Twitter"));

        let title_only = MetaScraper::new(r#"<title>Plain</title>"#);
        assert_eq!(title_only.title(), some("Plain"));
    }

    #[test]
    fn extract_description() {
        let page = MetaScraper::new(r#"<meta name="description" content="My Description" />"#);

        assert_eq!(page.extract_description(), some("My Description"));
    }

    #[test]
    fn extract_og_description() {
        let page =
            MetaScraper::new(r#"<meta property="og:description" content="My Description" />"#);

        assert_eq!(page.extract_og_description(), some("My Description"));
    }

    #[test]
    fn extract_twitter_description() {
        let page =
            MetaScraper::new(r#"<meta name="twitter:description" content="My Description" />"#);

        assert_eq!(page.extract_twitter_description(), some("My Description"));
    }

    /// `description()` prefers Open Graph, then Twitter, then the plain meta tag.
    #[test]
    fn description_priority() {
        let all_three = MetaScraper::new(
            r#"<meta name="description" content="Plain" />
            <meta name="twitter:description" content="Twitter" />
            <meta property="og:description" content="Open Graph" />"#,
        );
        assert_eq!(all_three.description(), some("Open Graph"));

        let without_og = MetaScraper::new(
            r#"<meta name="description" content="Plain" />
            <meta name="twitter:description" content="Twitter" />"#,
        );
        assert_eq!(without_og.description(), some("Twitter"));

        let plain_only = MetaScraper::new(r#"<meta name="description" content="Plain" />"#);
        assert_eq!(plain_only.description(), some("Plain"));
    }

    #[test]
    fn favicon() {
        let page = MetaScraper::new(r#"<link rel="icon" href="/favicon.ico" />"#);

        assert_eq!(page.favicon(), some("/favicon.ico"));
    }

    #[test]
    fn extract_og_image() {
        let page = MetaScraper::new(
            r#"<meta property="og:image" content="https://example.com/image.jpg" />"#,
        );

        assert_eq!(
            page.extract_og_image(),
            some("https://example.com/image.jpg")
        );
    }

    #[test]
    fn extract_og_images() {
        let page = MetaScraper::new(
            r#"
            <meta property="og:image" content="https://example.com/image.jpg" />
            <meta property="og:image" content="https://example.com/image.png" />"#,
        );

        let og_images = page.extract_og_images();

        assert_eq!(
            og_images,
            vec![
                "https://example.com/image.jpg".to_string(),
                "https://example.com/image.png".to_string()
            ]
        );
    }

    #[test]
    fn extract_twitter_image() {
        let page = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/image.jpg" />"#,
        );

        assert_eq!(
            page.extract_twitter_image(),
            some("https://example.com/image.jpg")
        );
    }

    /// `image()` prefers the Open Graph image and falls back to the Twitter one.
    #[test]
    fn image_priority() {
        let both = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/twitter.jpg" />
            <meta property="og:image" content="https://example.com/og.jpg" />"#,
        );
        assert_eq!(both.image(), some("https://example.com/og.jpg"));

        let twitter_only = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/twitter.jpg" />"#,
        );
        assert_eq!(
            twitter_only.image(),
            some("https://example.com/twitter.jpg")
        );
    }

    #[test]
    fn lang() {
        let page = MetaScraper::new(
            r#"
            <html lang="en">
            ...
            </html>
        "#,
        );

        assert_eq!(page.lang(), some("en"));
    }

    /// A document without any metadata yields `None` / an empty list everywhere.
    #[test]
    fn missing_metadata() {
        let page = MetaScraper::new("<p>no metadata here</p>");

        assert_eq!(page.title(), None);
        assert_eq!(page.description(), None);
        assert_eq!(page.image(), None);
        assert_eq!(page.favicon(), None);
        assert_eq!(page.lang(), None);
        assert!(page.extract_og_images().is_empty());
    }
}