Skip to main content

html_meta_scraper/
lib.rs

1//! Extract metadata from an HTML document — `<title>`, OpenGraph and
2//! Twitter Card tags, favicon, and the root `<html lang>` attribute.
3//!
4//! This crate parses HTML strings you provide; it does not fetch URLs.
5//!
6//! # Example
7//!
8//! ```
9//! use html_meta_scraper::MetaScraper;
10//!
11//! let html = r#"
12//!     <html lang="en">
13//!         <head>
14//!             <title>Native Title</title>
15//!             <meta property="og:title" content="OG Title" />
16//!             <meta property="og:image" content="https://example.com/cover.jpg" />
17//!         </head>
18//!     </html>
19//! "#;
20//!
21//! let m = MetaScraper::new(html);
22//! assert_eq!(m.title().as_deref(), Some("OG Title"));
23//! assert_eq!(m.image().as_deref(), Some("https://example.com/cover.jpg"));
24//! assert_eq!(m.lang().as_deref(), Some("en"));
25//! ```
26
27/// Holds a parsed HTML document and exposes accessors for common page
28/// metadata.
29///
30/// Construct one with [`MetaScraper::new`], then call the per-field
31/// accessors. The document is parsed once at construction time; accessors
32/// are read-only and may be called repeatedly.
33pub struct MetaScraper {
34    document: scraper::Html,
35}
36
37impl MetaScraper {
38    /// Parse an HTML document.
39    ///
40    /// Parsing is eager and lenient — malformed HTML will not panic.
41    pub fn new(html: &str) -> Self {
42        let document = scraper::Html::parse_document(html);
43
44        MetaScraper { document }
45    }
46
47    /// Returns the trimmed text of the first `<title>` element.
48    ///
49    /// Returns `None` if the tag is absent, empty, or whitespace-only.
50    ///
51    /// # Example
52    ///
53    /// ```
54    /// # use html_meta_scraper::MetaScraper;
55    /// let m = MetaScraper::new("<title>Page Title</title>");
56    /// assert_eq!(m.extract_title().as_deref(), Some("Page Title"));
57    /// ```
58    pub fn extract_title(&self) -> Option<String> {
59        self.document
60            .select(&scraper::Selector::parse("title").unwrap())
61            .next()
62            .map(|element| element.text().collect::<String>().trim().to_string())
63            .filter(|s| !s.is_empty())
64    }
65
66    /// Returns the OpenGraph title from `<meta property="og:title">`.
67    ///
68    /// Also accepts the non-conformant `<meta name="og:title">` variant
69    /// emitted by some CMSes. An empty `content` attribute is treated as
70    /// absent.
71    ///
72    /// # Example
73    ///
74    /// ```
75    /// # use html_meta_scraper::MetaScraper;
76    /// let m = MetaScraper::new(r#"<meta property="og:title" content="Hello" />"#);
77    /// assert_eq!(m.extract_og_title().as_deref(), Some("Hello"));
78    /// ```
79    pub fn extract_og_title(&self) -> Option<String> {
80        let og_title_selector =
81            scraper::Selector::parse("meta[property='og:title'], meta[name='og:title']").unwrap();
82
83        let og_title = self
84            .document
85            .select(&og_title_selector)
86            .next()
87            .and_then(|element| element.value().attr("content"))
88            .filter(|content| !content.is_empty())
89            .map(|content| content.to_string());
90
91        og_title
92    }
93
94    /// Returns the Twitter Card title from `<meta name="twitter:title">`.
95    ///
96    /// Also accepts `<meta property="twitter:title">`, which appears in
97    /// the wild. An empty `content` attribute is treated as absent.
98    ///
99    /// # Example
100    ///
101    /// ```
102    /// # use html_meta_scraper::MetaScraper;
103    /// let m = MetaScraper::new(r#"<meta name="twitter:title" content="Hello" />"#);
104    /// assert_eq!(m.extract_twitter_title().as_deref(), Some("Hello"));
105    /// ```
106    pub fn extract_twitter_title(&self) -> Option<String> {
107        let twitter_title_selector =
108            scraper::Selector::parse("meta[name='twitter:title'], meta[property='twitter:title']")
109                .unwrap();
110
111        let twitter_title = self
112            .document
113            .select(&twitter_title_selector)
114            .next()
115            .and_then(|element| element.value().attr("content"))
116            .filter(|content| !content.is_empty())
117            .map(|content| content.to_string());
118
119        twitter_title
120    }
121
122    /// Returns the page title, trying each source in turn and returning
123    /// the first match:
124    ///
125    /// 1. [`extract_og_title`](Self::extract_og_title) — `og:title`
126    /// 2. [`extract_twitter_title`](Self::extract_twitter_title) — `twitter:title`
127    /// 3. [`extract_title`](Self::extract_title) — `<title>`
128    pub fn title(&self) -> Option<String> {
129        self.extract_og_title()
130            .or_else(|| self.extract_twitter_title())
131            .or_else(|| self.extract_title())
132    }
133
134    /// Returns the standard description from `<meta name="description">`.
135    ///
136    /// An empty `content` attribute is treated as absent.
137    ///
138    /// # Example
139    ///
140    /// ```
141    /// # use html_meta_scraper::MetaScraper;
142    /// let m = MetaScraper::new(r#"<meta name="description" content="A page." />"#);
143    /// assert_eq!(m.extract_description().as_deref(), Some("A page."));
144    /// ```
145    pub fn extract_description(&self) -> Option<String> {
146        let description_selector = scraper::Selector::parse("meta[name='description']").unwrap();
147
148        let description = self
149            .document
150            .select(&description_selector)
151            .next()
152            .and_then(|element| element.value().attr("content"))
153            .filter(|content| !content.is_empty())
154            .map(|content| content.to_string());
155
156        description
157    }
158
159    /// Returns the OpenGraph description from
160    /// `<meta property="og:description">`.
161    ///
162    /// Also accepts the non-conformant `<meta name="og:description">`
163    /// variant. An empty `content` attribute is treated as absent.
164    ///
165    /// # Example
166    ///
167    /// ```
168    /// # use html_meta_scraper::MetaScraper;
169    /// let m = MetaScraper::new(r#"<meta property="og:description" content="A page." />"#);
170    /// assert_eq!(m.extract_og_description().as_deref(), Some("A page."));
171    /// ```
172    pub fn extract_og_description(&self) -> Option<String> {
173        let og_description_selector = scraper::Selector::parse(
174            "meta[property='og:description'], meta[name='og:description']",
175        )
176        .unwrap();
177
178        let og_description = self
179            .document
180            .select(&og_description_selector)
181            .next()
182            .and_then(|element| element.value().attr("content"))
183            .filter(|content| !content.is_empty())
184            .map(|content| content.to_string());
185
186        og_description
187    }
188
189    /// Returns the Twitter Card description from
190    /// `<meta name="twitter:description">`.
191    ///
192    /// Also accepts `<meta property="twitter:description">`. An empty
193    /// `content` attribute is treated as absent.
194    ///
195    /// # Example
196    ///
197    /// ```
198    /// # use html_meta_scraper::MetaScraper;
199    /// let m = MetaScraper::new(r#"<meta name="twitter:description" content="A page." />"#);
200    /// assert_eq!(m.extract_twitter_description().as_deref(), Some("A page."));
201    /// ```
202    pub fn extract_twitter_description(&self) -> Option<String> {
203        let twitter_description_selector = scraper::Selector::parse(
204            "meta[name='twitter:description'], meta[property='twitter:description']",
205        )
206        .unwrap();
207
208        let twitter_description = self
209            .document
210            .select(&twitter_description_selector)
211            .next()
212            .and_then(|element| element.value().attr("content"))
213            .filter(|content| !content.is_empty())
214            .map(|content| content.to_string());
215
216        twitter_description
217    }
218
219    /// Returns the page description, trying each source in turn and
220    /// returning the first match:
221    ///
222    /// 1. [`extract_og_description`](Self::extract_og_description) — `og:description`
223    /// 2. [`extract_twitter_description`](Self::extract_twitter_description) — `twitter:description`
224    /// 3. [`extract_description`](Self::extract_description) — `<meta name="description">`
225    pub fn description(&self) -> Option<String> {
226        self.extract_og_description()
227            .or_else(|| self.extract_twitter_description())
228            .or_else(|| self.extract_description())
229    }
230
231    /// Returns the `href` of the first `<link>` whose `rel` contains
232    /// `icon` as a whitespace-separated token.
233    ///
234    /// Matches `rel="icon"`, `rel="shortcut icon"`, `rel="icon shortcut"`,
235    /// and similar forms. Does not match `apple-touch-icon` (that's a
236    /// single different token).
237    ///
238    /// # Example
239    ///
240    /// ```
241    /// # use html_meta_scraper::MetaScraper;
242    /// let m = MetaScraper::new(r#"<link rel="shortcut icon" href="/favicon.ico" />"#);
243    /// assert_eq!(m.favicon().as_deref(), Some("/favicon.ico"));
244    /// ```
245    pub fn favicon(&self) -> Option<String> {
246        let favicon_selector = scraper::Selector::parse("link[rel~='icon']").unwrap();
247
248        let favicon = self
249            .document
250            .select(&favicon_selector)
251            .next()
252            .and_then(|element| element.value().attr("href").map(|href| href.to_string()));
253
254        favicon
255    }
256
257    /// Returns the first OpenGraph image URL from
258    /// `<meta property="og:image">`.
259    ///
260    /// Also accepts the non-conformant `<meta name="og:image">` variant.
261    /// An empty `content` attribute is treated as absent. For pages that
262    /// declare multiple images, see [`extract_og_images`](Self::extract_og_images).
263    ///
264    /// # Example
265    ///
266    /// ```
267    /// # use html_meta_scraper::MetaScraper;
268    /// let m = MetaScraper::new(r#"<meta property="og:image" content="https://example.com/i.jpg" />"#);
269    /// assert_eq!(m.extract_og_image().as_deref(), Some("https://example.com/i.jpg"));
270    /// ```
271    pub fn extract_og_image(&self) -> Option<String> {
272        let og_image_selector =
273            scraper::Selector::parse("meta[property='og:image'], meta[name='og:image']").unwrap();
274
275        let og_image = self
276            .document
277            .select(&og_image_selector)
278            .next()
279            .and_then(|element| element.value().attr("content"))
280            .filter(|content| !content.is_empty())
281            .map(|content| content.to_string());
282
283        og_image
284    }
285
286    /// Returns every OpenGraph image URL in document order.
287    ///
288    /// Both `<meta property="og:image">` and `<meta name="og:image">`
289    /// contribute. Empty `content` attributes are skipped. Returns an
290    /// empty `Vec` if none are declared.
291    ///
292    /// # Example
293    ///
294    /// ```
295    /// # use html_meta_scraper::MetaScraper;
296    /// let m = MetaScraper::new(r#"
297    ///     <meta property="og:image" content="https://example.com/a.jpg" />
298    ///     <meta property="og:image" content="https://example.com/b.png" />
299    /// "#);
300    /// assert_eq!(
301    ///     m.extract_og_images(),
302    ///     vec!["https://example.com/a.jpg", "https://example.com/b.png"],
303    /// );
304    /// ```
305    pub fn extract_og_images(&self) -> Vec<String> {
306        let og_image_selector =
307            scraper::Selector::parse("meta[property='og:image'], meta[name='og:image']").unwrap();
308
309        let og_images = self
310            .document
311            .select(&og_image_selector)
312            .filter_map(|element| element.value().attr("content"))
313            .filter(|content| !content.is_empty())
314            .map(|content| content.to_string())
315            .collect::<Vec<String>>();
316
317        og_images
318    }
319
320    /// Returns the Twitter Card image URL from
321    /// `<meta name="twitter:image">`.
322    ///
323    /// Also accepts `<meta property="twitter:image">`. An empty `content`
324    /// attribute is treated as absent. Related tags such as
325    /// `twitter:image:alt` are not returned.
326    ///
327    /// # Example
328    ///
329    /// ```
330    /// # use html_meta_scraper::MetaScraper;
331    /// let m = MetaScraper::new(r#"<meta name="twitter:image" content="https://example.com/i.jpg" />"#);
332    /// assert_eq!(m.extract_twitter_image().as_deref(), Some("https://example.com/i.jpg"));
333    /// ```
334    pub fn extract_twitter_image(&self) -> Option<String> {
335        let twitter_image_selector =
336            scraper::Selector::parse("meta[name='twitter:image'], meta[property='twitter:image']")
337                .unwrap();
338
339        let twitter_image = self
340            .document
341            .select(&twitter_image_selector)
342            .next()
343            .and_then(|element| element.value().attr("content"))
344            .filter(|content| !content.is_empty())
345            .map(|content| content.to_string());
346
347        twitter_image
348    }
349
350    /// Returns the page image URL, trying each source in turn and
351    /// returning the first match:
352    ///
353    /// 1. [`extract_og_image`](Self::extract_og_image) — `og:image`
354    /// 2. [`extract_twitter_image`](Self::extract_twitter_image) — `twitter:image`
355    ///
356    /// There is no native HTML element to fall back to, so an absent
357    /// result simply means neither tag was declared.
358    pub fn image(&self) -> Option<String> {
359        self.extract_og_image()
360            .or_else(|| self.extract_twitter_image())
361    }
362
363    /// Returns the value of the root `<html lang="...">` attribute.
364    ///
365    /// # Example
366    ///
367    /// ```
368    /// # use html_meta_scraper::MetaScraper;
369    /// let m = MetaScraper::new(r#"<html lang="en"><head></head></html>"#);
370    /// assert_eq!(m.lang().as_deref(), Some("en"));
371    /// ```
372    pub fn lang(&self) -> Option<String> {
373        let html_selector = scraper::Selector::parse("html").unwrap();
374
375        let lang = self
376            .document
377            .select(&html_selector)
378            .next()
379            .and_then(|element| {
380                element
381                    .value()
382                    .attr("lang")
383                    .map(|content| content.to_string())
384            });
385
386        lang
387    }
388}
389
#[cfg(test)]
mod test {
    //! Unit tests for [`MetaScraper`].
    //!
    //! The first group checks the happy path of every accessor. The second
    //! group pins edge cases: empty values, attribute-name variants,
    //! multi-token `rel` values, and fallback order.

    use super::*;

    // ---------------------------------------------------------------------
    // Happy-path tests, one per accessor.
    // ---------------------------------------------------------------------

    #[test]
    fn extract_title() {
        let meta = MetaScraper::new(r#"<title>Page Title</title>"#);
        assert_eq!(meta.extract_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_og_title() {
        let meta = MetaScraper::new(r#"<meta property="og:title" content="Page Title" />"#);
        assert_eq!(meta.extract_og_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_twitter_title() {
        let meta = MetaScraper::new(r#"<meta name="twitter:title" content="Page Title" />"#);
        assert_eq!(meta.extract_twitter_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_description() {
        let meta = MetaScraper::new(r#"<meta name="description" content="My Description" />"#);
        assert_eq!(meta.extract_description().as_deref(), Some("My Description"));
    }

    #[test]
    fn extract_og_description() {
        let meta =
            MetaScraper::new(r#"<meta property="og:description" content="My Description" />"#);
        assert_eq!(
            meta.extract_og_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn extract_twitter_description() {
        let meta =
            MetaScraper::new(r#"<meta name="twitter:description" content="My Description" />"#);
        assert_eq!(
            meta.extract_twitter_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn favicon() {
        let meta = MetaScraper::new(r#"<link rel="icon" href="/favicon.ico" />"#);
        assert_eq!(meta.favicon().as_deref(), Some("/favicon.ico"));
    }

    #[test]
    fn extract_og_image() {
        let meta = MetaScraper::new(
            r#"<meta property="og:image" content="https://example.com/image.jpg" />"#,
        );
        assert_eq!(
            meta.extract_og_image().as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    #[test]
    fn extract_og_images() {
        let meta = MetaScraper::new(
            r#"
            <meta property="og:image" content="https://example.com/image.jpg" />
            <meta property="og:image" content="https://example.com/image.png" />"#,
        );
        assert_eq!(
            meta.extract_og_images(),
            vec![
                "https://example.com/image.jpg",
                "https://example.com/image.png"
            ]
        );
    }

    #[test]
    fn extract_twitter_image() {
        let meta = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/image.jpg" />"#,
        );
        assert_eq!(
            meta.extract_twitter_image().as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    #[test]
    fn lang() {
        let meta = MetaScraper::new(
            r#"
            <html lang="en">
            ...
            </html>
        "#,
        );
        assert_eq!(meta.lang().as_deref(), Some("en"));
    }

    // ---------------------------------------------------------------------
    // Regression tests. Each one pins an edge case the accessors are
    // documented to handle; all of them are expected to PASS.
    // ---------------------------------------------------------------------

    #[test]
    fn empty_title_tag_returns_none() {
        let meta = MetaScraper::new(r#"<title></title>"#);
        assert_eq!(meta.extract_title(), None);
    }

    #[test]
    fn title_whitespace_is_trimmed() {
        let meta = MetaScraper::new("<title>\n  Page Title\n</title>");
        assert_eq!(meta.extract_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn empty_og_title_content_returns_none() {
        let meta = MetaScraper::new(r#"<meta property="og:title" content="" />"#);
        assert_eq!(meta.extract_og_title(), None);
    }

    #[test]
    fn empty_description_content_returns_none() {
        let meta = MetaScraper::new(r#"<meta name="description" content="" />"#);
        assert_eq!(meta.extract_description(), None);
    }

    #[test]
    fn og_title_with_name_attribute_is_recognized() {
        // Some CMSes emit `name="og:..."` instead of `property="og:..."`.
        let meta = MetaScraper::new(r#"<meta name="og:title" content="Page Title" />"#);
        assert_eq!(meta.extract_og_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn og_description_with_name_attribute_is_recognized() {
        let meta = MetaScraper::new(r#"<meta name="og:description" content="My Description" />"#);
        assert_eq!(
            meta.extract_og_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn og_image_with_name_attribute_is_recognized() {
        let meta =
            MetaScraper::new(r#"<meta name="og:image" content="https://example.com/i.jpg" />"#);
        assert_eq!(
            meta.extract_og_image().as_deref(),
            Some("https://example.com/i.jpg")
        );
    }

    #[test]
    fn twitter_title_with_property_attribute_is_recognized() {
        // Mirror case: Twitter tags sometimes appear as `property=`.
        let meta = MetaScraper::new(r#"<meta property="twitter:title" content="Page Title" />"#);
        assert_eq!(meta.extract_twitter_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn twitter_description_with_property_attribute_is_recognized() {
        let meta =
            MetaScraper::new(r#"<meta property="twitter:description" content="My Description" />"#);
        assert_eq!(
            meta.extract_twitter_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn twitter_image_with_property_attribute_is_recognized() {
        let meta = MetaScraper::new(
            r#"<meta property="twitter:image" content="https://example.com/i.jpg" />"#,
        );
        assert_eq!(
            meta.extract_twitter_image().as_deref(),
            Some("https://example.com/i.jpg")
        );
    }

    #[test]
    fn favicon_matches_shortcut_icon() {
        let meta = MetaScraper::new(r#"<link rel="shortcut icon" href="/favicon.ico" />"#);
        assert_eq!(meta.favicon().as_deref(), Some("/favicon.ico"));
    }

    #[test]
    fn favicon_matches_multi_token_rel() {
        let meta = MetaScraper::new(r#"<link rel="icon shortcut" href="/favicon.ico" />"#);
        assert_eq!(meta.favicon().as_deref(), Some("/favicon.ico"));
    }

    #[test]
    fn title_fallback_prefers_og_over_twitter_over_native() {
        // All three sources present: OpenGraph wins.
        let meta = MetaScraper::new(
            r#"
            <title>Native Title</title>
            <meta property="og:title" content="OG Title" />
            <meta name="twitter:title" content="Twitter Title" />
            "#,
        );
        assert_eq!(meta.title().as_deref(), Some("OG Title"));

        // No OpenGraph tag: Twitter wins over the native <title>.
        let meta = MetaScraper::new(
            r#"
            <title>Native Title</title>
            <meta name="twitter:title" content="Twitter Title" />
            "#,
        );
        assert_eq!(meta.title().as_deref(), Some("Twitter Title"));

        // Only the native <title> remains.
        let meta = MetaScraper::new(r#"<title>Native Title</title>"#);
        assert_eq!(meta.title().as_deref(), Some("Native Title"));
    }

    #[test]
    fn title_returns_none_when_no_source_present() {
        let meta = MetaScraper::new(r#"<html><head></head><body></body></html>"#);
        assert_eq!(meta.title(), None);
    }
}