1use scraper::{Html, ElementRef};
14use std::collections::HashMap;
15use url::Url;
16
17use crate::selector::{SELECTORS, try_parse_selector};
18use crate::types::{
19 PageMetadata, OpenGraph, TwitterCard, RobotsMeta,
20 AlternateLink, StructuredData,
21 ParserResult,
22};
23
/// Extracts all page metadata from a parsed HTML document.
///
/// Aggregates the title, charset, language, standard meta tags, canonical /
/// favicon / alternate links, robots directives, OpenGraph and Twitter card
/// data, and any custom meta entries into a single `PageMetadata`.
/// `base_url`, when provided, is used to resolve relative URLs (canonical,
/// favicon, apple-touch-icon, alternates).
///
/// Always returns `Ok`; the `ParserResult` wrapper keeps the signature
/// consistent with other parser entry points.
pub fn extract_metadata(document: &Html, base_url: Option<&Url>) -> ParserResult<PageMetadata> {
    let metadata = PageMetadata {
        title: extract_title(document),
        charset: extract_charset(document),
        language: extract_language(document),
        base_url: extract_base_url(document),
        viewport: extract_meta_content(document, "viewport"),
        description: extract_meta_content(document, "description"),
        keywords: extract_keywords(document),
        author: extract_meta_content(document, "author"),
        generator: extract_meta_content(document, "generator"),
        // Dates: prefer article:* properties, then schema.org-style names,
        // then the generic "date" meta as a last resort.
        published_date: extract_meta_content(document, "article:published_time")
            .or_else(|| extract_meta_content(document, "datePublished"))
            .or_else(|| extract_meta_content(document, "date")),
        modified_date: extract_meta_content(document, "article:modified_time")
            .or_else(|| extract_meta_content(document, "dateModified")),
        canonical: extract_canonical(document, base_url),
        favicon: extract_favicon(document, base_url),
        apple_touch_icon: extract_apple_touch_icon(document, base_url),
        theme_color: extract_meta_content(document, "theme-color"),
        robots: extract_robots(document),
        opengraph: extract_opengraph(document),
        twitter: extract_twitter_card(document),
        alternates: extract_alternates(document, base_url),
        // Not derived here; structured data is extracted separately
        // (see extract_structured_data).
        schema_type: None,
        custom: extract_custom_meta(document),
    };

    Ok(metadata)
}
59
60pub fn extract_title(document: &Html) -> Option<String> {
66 document
67 .select(&SELECTORS.title)
68 .next()
69 .map(|el| el.text().collect::<String>().trim().to_string())
70 .filter(|s| !s.is_empty())
71}
72
73pub fn extract_charset(document: &Html) -> Option<String> {
75 for meta in document.select(&SELECTORS.meta) {
77 if let Some(charset) = meta.value().attr("charset") {
78 return Some(charset.to_uppercase());
79 }
80 }
81
82 if let Some(sel) = try_parse_selector("meta[http-equiv='Content-Type']") {
84 if let Some(meta) = document.select(&sel).next() {
85 if let Some(content) = meta.value().attr("content") {
86 if let Some(charset_part) = content.split(';')
88 .find(|p| p.trim().to_lowercase().starts_with("charset"))
89 {
90 if let Some(charset) = charset_part.split('=').nth(1) {
91 return Some(charset.trim().to_uppercase());
92 }
93 }
94 }
95 }
96 }
97
98 None
99}
100
101pub fn extract_language(document: &Html) -> Option<String> {
103 document
104 .select(&SELECTORS.html)
105 .next()
106 .and_then(|el| el.value().attr("lang"))
107 .map(|s| s.to_string())
108}
109
110pub fn extract_base_url(document: &Html) -> Option<String> {
112 document
113 .select(&SELECTORS.base)
114 .next()
115 .and_then(|el| el.value().attr("href"))
116 .map(|s| s.to_string())
117}
118
119pub fn extract_meta_content(document: &Html, name: &str) -> Option<String> {
121 let name_selector = format!("meta[name='{}' i]", name);
123 if let Some(sel) = try_parse_selector(&name_selector) {
124 if let Some(meta) = document.select(&sel).next() {
125 if let Some(content) = meta.value().attr("content") {
126 let trimmed = content.trim();
127 if !trimmed.is_empty() {
128 return Some(trimmed.to_string());
129 }
130 }
131 }
132 }
133
134 let prop_selector = format!("meta[property='{}']", name);
136 if let Some(sel) = try_parse_selector(&prop_selector) {
137 if let Some(meta) = document.select(&sel).next() {
138 if let Some(content) = meta.value().attr("content") {
139 let trimmed = content.trim();
140 if !trimmed.is_empty() {
141 return Some(trimmed.to_string());
142 }
143 }
144 }
145 }
146
147 None
148}
149
150pub fn extract_keywords(document: &Html) -> Vec<String> {
152 extract_meta_content(document, "keywords")
153 .map(|s| {
154 s.split(',')
155 .map(|k| k.trim().to_string())
156 .filter(|k| !k.is_empty())
157 .collect()
158 })
159 .unwrap_or_default()
160}
161
162pub fn extract_canonical(document: &Html, base_url: Option<&Url>) -> Option<String> {
168 if let Some(sel) = try_parse_selector("link[rel='canonical']") {
169 if let Some(link) = document.select(&sel).next() {
170 if let Some(href) = link.value().attr("href") {
171 return resolve_url(href, base_url);
172 }
173 }
174 }
175 None
176}
177
178pub fn extract_favicon(document: &Html, base_url: Option<&Url>) -> Option<String> {
180 let selectors = [
182 "link[rel='icon']",
183 "link[rel='shortcut icon']",
184 "link[rel='icon shortcut']",
185 ];
186
187 for sel_str in selectors {
188 if let Some(sel) = try_parse_selector(sel_str) {
189 if let Some(link) = document.select(&sel).next() {
190 if let Some(href) = link.value().attr("href") {
191 return resolve_url(href, base_url);
192 }
193 }
194 }
195 }
196
197 base_url.map(|u| {
199 let mut favicon_url = u.clone();
200 favicon_url.set_path("/favicon.ico");
201 favicon_url.set_query(None);
202 favicon_url.to_string()
203 })
204}
205
206pub fn extract_apple_touch_icon(document: &Html, base_url: Option<&Url>) -> Option<String> {
208 let selectors = [
209 "link[rel='apple-touch-icon']",
210 "link[rel='apple-touch-icon-precomposed']",
211 ];
212
213 for sel_str in selectors {
214 if let Some(sel) = try_parse_selector(sel_str) {
215 let icons: Vec<_> = document.select(&sel).collect();
217 if let Some(icon) = find_largest_icon(&icons) {
218 if let Some(href) = icon.value().attr("href") {
219 return resolve_url(href, base_url);
220 }
221 }
222 }
223 }
224
225 None
226}
227
228fn find_largest_icon<'a>(icons: &'a [ElementRef<'a>]) -> Option<&'a ElementRef<'a>> {
230 if icons.is_empty() {
231 return None;
232 }
233
234 let mut best: Option<(&ElementRef, u32)> = None;
236
237 for icon in icons {
238 let size = icon.value().attr("sizes")
239 .and_then(|s| {
240 let parts: Vec<_> = s.split('x').collect();
242 if parts.len() == 2 {
243 parts[0].parse::<u32>().ok()
244 } else {
245 None
246 }
247 })
248 .unwrap_or(0);
249
250 if best.is_none() || size > best.unwrap().1 {
251 best = Some((icon, size));
252 }
253 }
254
255 best.map(|(el, _)| el)
256}
257
258pub fn extract_alternates(document: &Html, base_url: Option<&Url>) -> Vec<AlternateLink> {
260 let mut alternates = Vec::new();
261
262 if let Some(sel) = try_parse_selector("link[rel='alternate'][hreflang]") {
263 for link in document.select(&sel) {
264 if let (Some(hreflang), Some(href)) = (
265 link.value().attr("hreflang"),
266 link.value().attr("href"),
267 ) {
268 if let Some(resolved) = resolve_url(href, base_url) {
269 alternates.push(AlternateLink {
270 hreflang: hreflang.to_string(),
271 href: resolved,
272 });
273 }
274 }
275 }
276 }
277
278 alternates
279}
280
281pub fn extract_robots(document: &Html) -> RobotsMeta {
287 let mut robots = RobotsMeta::allowed();
288
289 let content = extract_meta_content(document, "robots")
291 .or_else(|| extract_meta_content(document, "googlebot"));
292
293 if let Some(content) = content {
294 robots.raw = Some(content.clone());
295
296 let directives: Vec<_> = content
297 .to_lowercase()
298 .split(',')
299 .map(|s| s.trim().to_string())
300 .collect();
301
302 for directive in &directives {
303 match directive.as_str() {
304 "noindex" => robots.index = false,
305 "nofollow" => robots.follow = false,
306 "none" => {
307 robots.index = false;
308 robots.follow = false;
309 }
310 "noarchive" => robots.archive = false,
311 "nocache" => robots.cache = false,
312 "nosnippet" => robots.snippet = false,
313 _ => {
314 if let Some((key, value)) = directive.split_once(':') {
316 match key {
317 "max-snippet" => {
318 robots.max_snippet = value.parse().unwrap_or(-1);
319 }
320 "max-image-preview" => {
321 robots.max_image_preview = Some(value.to_string());
322 }
323 "max-video-preview" => {
324 robots.max_video_preview = value.parse().unwrap_or(-1);
325 }
326 _ => {}
327 }
328 }
329 }
330 }
331 }
332 }
333
334 robots
335}
336
/// Builds the `OpenGraph` data from `og:*` meta properties.
///
/// Well-known properties get dedicated fields; any remaining `og:*`
/// properties are collected into `extra`.
pub fn extract_opengraph(document: &Html) -> OpenGraph {
    OpenGraph {
        title: extract_og_property(document, "og:title"),
        og_type: extract_og_property(document, "og:type"),
        url: extract_og_property(document, "og:url"),
        image: extract_og_property(document, "og:image"),
        description: extract_og_property(document, "og:description"),
        site_name: extract_og_property(document, "og:site_name"),
        locale: extract_og_property(document, "og:locale"),
        video: extract_og_property(document, "og:video"),
        audio: extract_og_property(document, "og:audio"),
        extra: extract_all_og_properties(document),
    }
}
356
357fn extract_og_property(document: &Html, property: &str) -> Option<String> {
359 let selector = format!("meta[property='{}']", property);
360 if let Some(sel) = try_parse_selector(&selector) {
361 if let Some(meta) = document.select(&sel).next() {
362 return meta.value().attr("content")
363 .map(|s| s.trim().to_string())
364 .filter(|s| !s.is_empty());
365 }
366 }
367 None
368}
369
370fn extract_all_og_properties(document: &Html) -> HashMap<String, String> {
372 let mut props = HashMap::new();
373
374 let standard = ["og:title", "og:type", "og:url", "og:image",
376 "og:description", "og:site_name", "og:locale",
377 "og:video", "og:audio"];
378
379 for meta in document.select(&SELECTORS.meta) {
380 if let Some(property) = meta.value().attr("property") {
381 if property.starts_with("og:") && !standard.contains(&property) {
382 if let Some(content) = meta.value().attr("content") {
383 props.insert(property.to_string(), content.to_string());
384 }
385 }
386 }
387 }
388
389 props
390}
391
/// Builds the `TwitterCard` data from `twitter:*` meta tags.
///
/// Well-known tags get dedicated fields; any remaining `twitter:*` tags
/// are collected into `extra`.
pub fn extract_twitter_card(document: &Html) -> TwitterCard {
    TwitterCard {
        card: extract_twitter_property(document, "twitter:card"),
        site: extract_twitter_property(document, "twitter:site"),
        creator: extract_twitter_property(document, "twitter:creator"),
        title: extract_twitter_property(document, "twitter:title"),
        description: extract_twitter_property(document, "twitter:description"),
        image: extract_twitter_property(document, "twitter:image"),
        extra: extract_all_twitter_properties(document),
    }
}
408
409fn extract_twitter_property(document: &Html, name: &str) -> Option<String> {
411 let prop_selector = format!("meta[property='{}']", name);
413 if let Some(sel) = try_parse_selector(&prop_selector) {
414 if let Some(meta) = document.select(&sel).next() {
415 if let Some(content) = meta.value().attr("content") {
416 let trimmed = content.trim();
417 if !trimmed.is_empty() {
418 return Some(trimmed.to_string());
419 }
420 }
421 }
422 }
423
424 let name_selector = format!("meta[name='{}']", name);
426 if let Some(sel) = try_parse_selector(&name_selector) {
427 if let Some(meta) = document.select(&sel).next() {
428 if let Some(content) = meta.value().attr("content") {
429 let trimmed = content.trim();
430 if !trimmed.is_empty() {
431 return Some(trimmed.to_string());
432 }
433 }
434 }
435 }
436
437 None
438}
439
440fn extract_all_twitter_properties(document: &Html) -> HashMap<String, String> {
442 let mut props = HashMap::new();
443
444 let standard = ["twitter:card", "twitter:site", "twitter:creator",
445 "twitter:title", "twitter:description", "twitter:image"];
446
447 for meta in document.select(&SELECTORS.meta) {
448 let key = meta.value().attr("property")
449 .or_else(|| meta.value().attr("name"));
450
451 if let Some(key) = key {
452 if key.starts_with("twitter:") && !standard.contains(&key) {
453 if let Some(content) = meta.value().attr("content") {
454 props.insert(key.to_string(), content.to_string());
455 }
456 }
457 }
458 }
459
460 props
461}
462
463pub fn extract_structured_data(document: &Html) -> Vec<StructuredData> {
469 let mut data = Vec::new();
470
471 data.extend(extract_json_ld(document));
473
474 data.extend(extract_microdata(document));
476
477 data
478}
479
480pub fn extract_json_ld(document: &Html) -> Vec<StructuredData> {
482 let mut data = Vec::new();
483
484 for script in document.select(&SELECTORS.json_ld) {
485 let raw_json = script.text().collect::<String>();
486 let trimmed = raw_json.trim();
487
488 if trimmed.is_empty() {
489 continue;
490 }
491
492 let mut item = StructuredData::json_ld(trimmed);
493
494 if let Ok(json) = serde_json::from_str::<serde_json::Value>(trimmed) {
496 if let Some(schema_type) = json.get("@type").and_then(|v| v.as_str()) {
497 item.schema_type = Some(schema_type.to_string());
498 }
499
500 if let serde_json::Value::Object(map) = json {
502 for (key, value) in map {
503 item.properties.insert(key, value);
504 }
505 }
506 }
507
508 data.push(item);
509 }
510
511 data
512}
513
514pub fn extract_microdata(document: &Html) -> Vec<StructuredData> {
516 let mut data = Vec::new();
517
518 for item in document.select(&SELECTORS.microdata) {
519 if let Some(itemtype) = item.value().attr("itemtype") {
520 let schema_type = itemtype
522 .rsplit('/')
523 .next()
524 .unwrap_or(itemtype)
525 .to_string();
526
527 let mut structured = StructuredData::microdata(&schema_type);
528
529 if let Some(sel) = try_parse_selector("[itemprop]") {
531 for prop in item.select(&sel) {
532 if let Some(prop_name) = prop.value().attr("itemprop") {
533 let value = prop.value().attr("content")
535 .or_else(|| prop.value().attr("href"))
536 .or_else(|| prop.value().attr("src"))
537 .map(|s| s.to_string())
538 .unwrap_or_else(|| prop.text().collect::<String>().trim().to_string());
539
540 structured.properties.insert(
541 prop_name.to_string(),
542 serde_json::Value::String(value),
543 );
544 }
545 }
546 }
547
548 data.push(structured);
549 }
550 }
551
552 data
553}
554
555fn extract_custom_meta(document: &Html) -> HashMap<String, String> {
561 let mut custom = HashMap::new();
562
563 let standard = [
565 "description", "keywords", "author", "viewport", "robots",
566 "generator", "theme-color", "msapplication-TileColor",
567 ];
568
569 for meta in document.select(&SELECTORS.meta) {
570 if let Some(name) = meta.value().attr("name") {
571 if !standard.contains(&name)
573 && !name.starts_with("og:")
574 && !name.starts_with("twitter:")
575 && !name.starts_with("article:")
576 {
577 if let Some(content) = meta.value().attr("content") {
578 custom.insert(name.to_string(), content.to_string());
579 }
580 }
581 }
582 }
583
584 custom
585}
586
587fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
593 let trimmed = href.trim();
594
595 if trimmed.is_empty() {
596 return None;
597 }
598
599 if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
601 return Some(trimmed.to_string());
602 }
603
604 if trimmed.starts_with("//") {
606 return Some(format!("https:{}", trimmed));
607 }
608
609 base_url
611 .and_then(|base| base.join(trimmed).ok())
612 .map(|u| u.to_string())
613}
614
#[cfg(test)]
mod tests {
    //! Unit tests for the metadata extraction helpers, each driven by a
    //! minimal inline HTML fixture.

    use super::*;
    use crate::types::StructuredDataFormat;

    /// Shorthand: parse an HTML string into a full `Html` document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_title() {
        let doc = parse_html("<html><head><title>Test Page</title></head></html>");
        assert_eq!(extract_title(&doc), Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_with_whitespace() {
        // Leading/trailing whitespace around the title text must be trimmed.
        let doc = parse_html("<html><head><title> Test Page </title></head></html>");
        assert_eq!(extract_title(&doc), Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_empty() {
        // An empty <title> is reported as absent, not as Some("").
        let doc = parse_html("<html><head><title></title></head></html>");
        assert_eq!(extract_title(&doc), None);
    }

    #[test]
    fn test_extract_charset_meta() {
        let doc = parse_html("<html><head><meta charset='UTF-8'></head></html>");
        assert_eq!(extract_charset(&doc), Some("UTF-8".to_string()));
    }

    #[test]
    fn test_extract_charset_content_type() {
        // Legacy http-equiv form; result is upper-cased.
        let doc = parse_html(
            "<html><head><meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'></head></html>"
        );
        assert_eq!(extract_charset(&doc), Some("ISO-8859-1".to_string()));
    }

    #[test]
    fn test_extract_language() {
        let doc = parse_html("<html lang='en-US'><head></head></html>");
        assert_eq!(extract_language(&doc), Some("en-US".to_string()));
    }

    #[test]
    fn test_extract_meta_content() {
        let doc = parse_html(
            "<html><head><meta name='description' content='Test description'></head></html>"
        );
        assert_eq!(
            extract_meta_content(&doc, "description"),
            Some("Test description".to_string())
        );
    }

    #[test]
    fn test_extract_keywords() {
        // Comma-separated keywords are split and trimmed.
        let doc = parse_html(
            "<html><head><meta name='keywords' content='rust, web, scraping'></head></html>"
        );
        let keywords = extract_keywords(&doc);
        assert_eq!(keywords, vec!["rust", "web", "scraping"]);
    }

    #[test]
    fn test_extract_canonical() {
        let doc = parse_html(
            "<html><head><link rel='canonical' href='https://example.com/page'></head></html>"
        );
        assert_eq!(
            extract_canonical(&doc, None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_extract_canonical_relative() {
        // A relative canonical href is resolved against the base URL.
        let doc = parse_html(
            "<html><head><link rel='canonical' href='/page'></head></html>"
        );
        let base = Url::parse("https://example.com").unwrap();
        assert_eq!(
            extract_canonical(&doc, Some(&base)),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_extract_robots_default() {
        // No robots meta tag: indexing and following are allowed by default.
        let doc = parse_html("<html><head></head></html>");
        let robots = extract_robots(&doc);
        assert!(robots.index);
        assert!(robots.follow);
    }

    #[test]
    fn test_extract_robots_noindex() {
        let doc = parse_html(
            "<html><head><meta name='robots' content='noindex, nofollow'></head></html>"
        );
        let robots = extract_robots(&doc);
        assert!(!robots.index);
        assert!(!robots.follow);
    }

    #[test]
    fn test_extract_robots_advanced() {
        // key:value directives populate the max-* fields.
        let doc = parse_html(
            "<html><head><meta name='robots' content='noarchive, max-snippet:150, max-image-preview:large'></head></html>"
        );
        let robots = extract_robots(&doc);
        assert!(robots.index);
        assert!(!robots.archive);
        assert_eq!(robots.max_snippet, 150);
        assert_eq!(robots.max_image_preview, Some("large".to_string()));
    }

    #[test]
    fn test_extract_opengraph() {
        let doc = parse_html(r#"
            <html><head>
            <meta property="og:title" content="OG Title">
            <meta property="og:type" content="article">
            <meta property="og:url" content="https://example.com/article">
            <meta property="og:image" content="https://example.com/image.jpg">
            <meta property="og:description" content="OG Description">
            </head></html>
        "#);

        let og = extract_opengraph(&doc);
        assert!(og.is_present());
        assert_eq!(og.title, Some("OG Title".to_string()));
        assert_eq!(og.og_type, Some("article".to_string()));
        assert_eq!(og.url, Some("https://example.com/article".to_string()));
    }

    #[test]
    fn test_extract_twitter_card() {
        // Twitter tags here use the `name` attribute form.
        let doc = parse_html(r#"
            <html><head>
            <meta name="twitter:card" content="summary_large_image">
            <meta name="twitter:site" content="@example">
            <meta name="twitter:title" content="Twitter Title">
            </head></html>
        "#);

        let twitter = extract_twitter_card(&doc);
        assert!(twitter.is_present());
        assert_eq!(twitter.card, Some("summary_large_image".to_string()));
        assert_eq!(twitter.site, Some("@example".to_string()));
    }

    #[test]
    fn test_extract_alternates() {
        let doc = parse_html(r#"
            <html><head>
            <link rel="alternate" hreflang="en" href="https://example.com/en/">
            <link rel="alternate" hreflang="fr" href="https://example.com/fr/">
            <link rel="alternate" hreflang="x-default" href="https://example.com/">
            </head></html>
        "#);

        let alternates = extract_alternates(&doc, None);
        assert_eq!(alternates.len(), 3);
        assert!(alternates.iter().any(|a| a.hreflang == "en"));
        assert!(alternates.iter().any(|a| a.hreflang == "fr"));
        assert!(alternates.iter().any(|a| a.hreflang == "x-default"));
    }

    #[test]
    fn test_extract_json_ld() {
        let doc = parse_html(r#"
            <html><head>
            <script type="application/ld+json">
            {
                "@context": "https://schema.org",
                "@type": "Article",
                "headline": "Test Article"
            }
            </script>
            </head></html>
        "#);

        let data = extract_json_ld(&doc);
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].format, StructuredDataFormat::JsonLd);
        assert_eq!(data[0].schema_type, Some("Article".to_string()));
    }

    #[test]
    fn test_extract_microdata() {
        // schema_type is the last path segment of the itemtype URL.
        let doc = parse_html(r#"
            <div itemscope itemtype="https://schema.org/Person">
                <span itemprop="name">John Doe</span>
                <span itemprop="jobTitle">Software Engineer</span>
            </div>
        "#);

        let data = extract_microdata(&doc);
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].format, StructuredDataFormat::Microdata);
        assert_eq!(data[0].schema_type, Some("Person".to_string()));
    }

    #[test]
    fn test_extract_metadata_full() {
        // End-to-end: one document exercising most extraction paths.
        let doc = parse_html(r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <title>Full Test Page</title>
                <meta name="description" content="A complete test page">
                <meta name="keywords" content="test, page, rust">
                <meta name="author" content="Test Author">
                <meta name="robots" content="index, follow">
                <link rel="canonical" href="https://example.com/page">
                <meta property="og:title" content="OG Title">
                <meta name="twitter:card" content="summary">
            </head>
            <body></body>
            </html>
        "#);

        let metadata = extract_metadata(&doc, None).unwrap();

        assert_eq!(metadata.title, Some("Full Test Page".to_string()));
        assert_eq!(metadata.description, Some("A complete test page".to_string()));
        assert_eq!(metadata.language, Some("en".to_string()));
        assert_eq!(metadata.charset, Some("UTF-8".to_string()));
        assert!(metadata.robots.index);
        assert!(metadata.opengraph.is_present());
        assert!(metadata.twitter.is_present());
    }

    #[test]
    fn test_resolve_url_absolute() {
        // Absolute URLs pass through untouched even without a base.
        assert_eq!(
            resolve_url("https://example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_protocol_relative() {
        // Protocol-relative URLs default to https.
        assert_eq!(
            resolve_url("//example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_relative() {
        let base = Url::parse("https://example.com/dir/").unwrap();
        assert_eq!(
            resolve_url("page.html", Some(&base)),
            Some("https://example.com/dir/page.html".to_string())
        );
    }

    #[test]
    fn test_favicon_default() {
        // With no declared icon, fall back to /favicon.ico at the site root
        // (the base URL's path is replaced).
        let doc = parse_html("<html><head></head></html>");
        let base = Url::parse("https://example.com/page").unwrap();
        let favicon = extract_favicon(&doc, Some(&base));
        assert_eq!(favicon, Some("https://example.com/favicon.ico".to_string()));
    }

    #[test]
    fn test_favicon_explicit() {
        let doc = parse_html(
            "<html><head><link rel='icon' href='/icons/favicon.png'></head></html>"
        );
        let base = Url::parse("https://example.com").unwrap();
        let favicon = extract_favicon(&doc, Some(&base));
        assert_eq!(favicon, Some("https://example.com/icons/favicon.png".to_string()));
    }
}