/// Reads page metadata (titles, descriptions, images, favicon, language)
/// out of an HTML document that is parsed once at construction time.
pub struct MetaScraper {
    /// Parsed document tree that every extraction method runs its CSS
    /// selectors against.
    document: scraper::Html,
}
36
37impl MetaScraper {
38 pub fn new(html: &str) -> Self {
42 let document = scraper::Html::parse_document(html);
43
44 MetaScraper { document }
45 }
46
47 pub fn extract_title(&self) -> Option<String> {
59 self.document
60 .select(&scraper::Selector::parse("title").unwrap())
61 .next()
62 .map(|element| element.text().collect::<String>().trim().to_string())
63 .filter(|s| !s.is_empty())
64 }
65
66 pub fn extract_og_title(&self) -> Option<String> {
80 let og_title_selector =
81 scraper::Selector::parse("meta[property='og:title'], meta[name='og:title']").unwrap();
82
83 let og_title = self
84 .document
85 .select(&og_title_selector)
86 .next()
87 .and_then(|element| element.value().attr("content"))
88 .filter(|content| !content.is_empty())
89 .map(|content| content.to_string());
90
91 og_title
92 }
93
94 pub fn extract_twitter_title(&self) -> Option<String> {
107 let twitter_title_selector =
108 scraper::Selector::parse("meta[name='twitter:title'], meta[property='twitter:title']")
109 .unwrap();
110
111 let twitter_title = self
112 .document
113 .select(&twitter_title_selector)
114 .next()
115 .and_then(|element| element.value().attr("content"))
116 .filter(|content| !content.is_empty())
117 .map(|content| content.to_string());
118
119 twitter_title
120 }
121
122 pub fn title(&self) -> Option<String> {
129 self.extract_og_title()
130 .or_else(|| self.extract_twitter_title())
131 .or_else(|| self.extract_title())
132 }
133
134 pub fn extract_description(&self) -> Option<String> {
146 let description_selector = scraper::Selector::parse("meta[name='description']").unwrap();
147
148 let description = self
149 .document
150 .select(&description_selector)
151 .next()
152 .and_then(|element| element.value().attr("content"))
153 .filter(|content| !content.is_empty())
154 .map(|content| content.to_string());
155
156 description
157 }
158
159 pub fn extract_og_description(&self) -> Option<String> {
173 let og_description_selector = scraper::Selector::parse(
174 "meta[property='og:description'], meta[name='og:description']",
175 )
176 .unwrap();
177
178 let og_description = self
179 .document
180 .select(&og_description_selector)
181 .next()
182 .and_then(|element| element.value().attr("content"))
183 .filter(|content| !content.is_empty())
184 .map(|content| content.to_string());
185
186 og_description
187 }
188
189 pub fn extract_twitter_description(&self) -> Option<String> {
203 let twitter_description_selector = scraper::Selector::parse(
204 "meta[name='twitter:description'], meta[property='twitter:description']",
205 )
206 .unwrap();
207
208 let twitter_description = self
209 .document
210 .select(&twitter_description_selector)
211 .next()
212 .and_then(|element| element.value().attr("content"))
213 .filter(|content| !content.is_empty())
214 .map(|content| content.to_string());
215
216 twitter_description
217 }
218
219 pub fn description(&self) -> Option<String> {
226 self.extract_og_description()
227 .or_else(|| self.extract_twitter_description())
228 .or_else(|| self.extract_description())
229 }
230
231 pub fn favicon(&self) -> Option<String> {
246 let favicon_selector = scraper::Selector::parse("link[rel~='icon']").unwrap();
247
248 let favicon = self
249 .document
250 .select(&favicon_selector)
251 .next()
252 .and_then(|element| element.value().attr("href").map(|href| href.to_string()));
253
254 favicon
255 }
256
257 pub fn extract_og_image(&self) -> Option<String> {
272 let og_image_selector =
273 scraper::Selector::parse("meta[property='og:image'], meta[name='og:image']").unwrap();
274
275 let og_image = self
276 .document
277 .select(&og_image_selector)
278 .next()
279 .and_then(|element| element.value().attr("content"))
280 .filter(|content| !content.is_empty())
281 .map(|content| content.to_string());
282
283 og_image
284 }
285
286 pub fn extract_og_images(&self) -> Vec<String> {
306 let og_image_selector =
307 scraper::Selector::parse("meta[property='og:image'], meta[name='og:image']").unwrap();
308
309 let og_images = self
310 .document
311 .select(&og_image_selector)
312 .filter_map(|element| element.value().attr("content"))
313 .filter(|content| !content.is_empty())
314 .map(|content| content.to_string())
315 .collect::<Vec<String>>();
316
317 og_images
318 }
319
320 pub fn extract_twitter_image(&self) -> Option<String> {
335 let twitter_image_selector =
336 scraper::Selector::parse("meta[name='twitter:image'], meta[property='twitter:image']")
337 .unwrap();
338
339 let twitter_image = self
340 .document
341 .select(&twitter_image_selector)
342 .next()
343 .and_then(|element| element.value().attr("content"))
344 .filter(|content| !content.is_empty())
345 .map(|content| content.to_string());
346
347 twitter_image
348 }
349
350 pub fn image(&self) -> Option<String> {
359 self.extract_og_image()
360 .or_else(|| self.extract_twitter_image())
361 }
362
363 pub fn lang(&self) -> Option<String> {
373 let html_selector = scraper::Selector::parse("html").unwrap();
374
375 let lang = self
376 .document
377 .select(&html_selector)
378 .next()
379 .and_then(|element| {
380 element
381 .value()
382 .attr("lang")
383 .map(|content| content.to_string())
384 });
385
386 lang
387 }
388}
389
#[cfg(test)]
mod test {
    use super::*;

    // --- Basic extraction: one tag in, one value out -----------------------

    #[test]
    fn extract_title() {
        let page = MetaScraper::new(r#"<title>Page Title</title>"#);

        assert_eq!(page.extract_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_og_title() {
        let page = MetaScraper::new(r#"<meta property="og:title" content="Page Title" />"#);

        assert_eq!(page.extract_og_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_twitter_title() {
        let page = MetaScraper::new(r#"<meta name="twitter:title" content="Page Title" />"#);

        assert_eq!(page.extract_twitter_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn extract_description() {
        let page = MetaScraper::new(r#"<meta name="description" content="My Description" />"#);

        assert_eq!(page.extract_description().as_deref(), Some("My Description"));
    }

    #[test]
    fn extract_og_description() {
        let page =
            MetaScraper::new(r#"<meta property="og:description" content="My Description" />"#);

        assert_eq!(
            page.extract_og_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn extract_twitter_description() {
        let page =
            MetaScraper::new(r#"<meta name="twitter:description" content="My Description" />"#);

        assert_eq!(
            page.extract_twitter_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn favicon() {
        let page = MetaScraper::new(r#"<link rel="icon" href="/favicon.ico" />"#);

        assert_eq!(page.favicon().as_deref(), Some("/favicon.ico"));
    }

    #[test]
    fn extract_og_image() {
        let page = MetaScraper::new(
            r#"<meta property="og:image" content="https://example.com/image.jpg" />"#,
        );

        assert_eq!(
            page.extract_og_image().as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    #[test]
    fn extract_og_images() {
        let page = MetaScraper::new(
            r#"
            <meta property="og:image" content="https://example.com/image.jpg" />
            <meta property="og:image" content="https://example.com/image.png" />"#,
        );

        // Both tags must come back, in document order.
        assert_eq!(
            page.extract_og_images(),
            [
                "https://example.com/image.jpg",
                "https://example.com/image.png"
            ]
        );
    }

    #[test]
    fn extract_twitter_image() {
        let page = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/image.jpg" />"#,
        );

        assert_eq!(
            page.extract_twitter_image().as_deref(),
            Some("https://example.com/image.jpg")
        );
    }

    #[test]
    fn lang() {
        let page = MetaScraper::new(
            r#"
            <html lang="en">
            ...
            </html>
            "#,
        );

        assert_eq!(page.lang().as_deref(), Some("en"));
    }

    // --- Empty and whitespace-only values ----------------------------------

    #[test]
    fn empty_title_tag_returns_none() {
        assert!(MetaScraper::new(r#"<title></title>"#)
            .extract_title()
            .is_none());
    }

    #[test]
    fn title_whitespace_is_trimmed() {
        let page = MetaScraper::new("<title>\n Page Title\n</title>");

        assert_eq!(page.extract_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn empty_og_title_content_returns_none() {
        assert!(
            MetaScraper::new(r#"<meta property="og:title" content="" />"#)
                .extract_og_title()
                .is_none()
        );
    }

    #[test]
    fn empty_description_content_returns_none() {
        assert!(
            MetaScraper::new(r#"<meta name="description" content="" />"#)
                .extract_description()
                .is_none()
        );
    }

    // --- `name` / `property` are interchangeable ---------------------------

    #[test]
    fn og_title_with_name_attribute_is_recognized() {
        let page = MetaScraper::new(r#"<meta name="og:title" content="Page Title" />"#);

        assert_eq!(page.extract_og_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn og_description_with_name_attribute_is_recognized() {
        let page = MetaScraper::new(r#"<meta name="og:description" content="My Description" />"#);

        assert_eq!(
            page.extract_og_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn og_image_with_name_attribute_is_recognized() {
        let page =
            MetaScraper::new(r#"<meta name="og:image" content="https://example.com/i.jpg" />"#);

        assert_eq!(
            page.extract_og_image().as_deref(),
            Some("https://example.com/i.jpg")
        );
    }

    #[test]
    fn twitter_title_with_property_attribute_is_recognized() {
        let page = MetaScraper::new(r#"<meta property="twitter:title" content="Page Title" />"#);

        assert_eq!(page.extract_twitter_title().as_deref(), Some("Page Title"));
    }

    #[test]
    fn twitter_description_with_property_attribute_is_recognized() {
        let page =
            MetaScraper::new(r#"<meta property="twitter:description" content="My Description" />"#);

        assert_eq!(
            page.extract_twitter_description().as_deref(),
            Some("My Description")
        );
    }

    #[test]
    fn twitter_image_with_property_attribute_is_recognized() {
        let page = MetaScraper::new(
            r#"<meta property="twitter:image" content="https://example.com/i.jpg" />"#,
        );

        assert_eq!(
            page.extract_twitter_image().as_deref(),
            Some("https://example.com/i.jpg")
        );
    }

    // --- Favicon `rel` token matching --------------------------------------

    #[test]
    fn favicon_matches_shortcut_icon() {
        let page = MetaScraper::new(r#"<link rel="shortcut icon" href="/favicon.ico" />"#);

        assert_eq!(page.favicon().as_deref(), Some("/favicon.ico"));
    }

    #[test]
    fn favicon_matches_multi_token_rel() {
        let page = MetaScraper::new(r#"<link rel="icon shortcut" href="/favicon.ico" />"#);

        assert_eq!(page.favicon().as_deref(), Some("/favicon.ico"));
    }

    // --- Fallback order of `title()` ---------------------------------------

    #[test]
    fn title_fallback_prefers_og_over_twitter_over_native() {
        // All three sources present: Open Graph wins.
        let all_sources = MetaScraper::new(
            r#"
            <title>Native Title</title>
            <meta property="og:title" content="OG Title" />
            <meta name="twitter:title" content="Twitter Title" />
            "#,
        );
        assert_eq!(all_sources.title().as_deref(), Some("OG Title"));

        // No Open Graph tag: the Twitter card beats the <title> element.
        let without_og = MetaScraper::new(
            r#"
            <title>Native Title</title>
            <meta name="twitter:title" content="Twitter Title" />
            "#,
        );
        assert_eq!(without_og.title().as_deref(), Some("Twitter Title"));

        // Only the <title> element is left.
        let native_only = MetaScraper::new(r#"<title>Native Title</title>"#);
        assert_eq!(native_only.title().as_deref(), Some("Native Title"));
    }

    #[test]
    fn title_returns_none_when_no_source_present() {
        let page = MetaScraper::new(r#"<html><head></head><body></body></html>"#);

        assert!(page.title().is_none());
    }
}