// halldyll_parser/metadata.rs

//! Metadata extraction for halldyll-parser
//!
//! This module handles extraction of:
//! - Title and meta tags
//! - OpenGraph metadata
//! - Twitter Card metadata
//! - Robots directives
//! - Canonical URLs
//! - Hreflang alternates
//! - Structured data (JSON-LD, Microdata)
//! - Favicon and icons

13use scraper::{Html, ElementRef};
14use std::collections::HashMap;
15use url::Url;
16
17use crate::selector::{SELECTORS, try_parse_selector};
18use crate::types::{
19    PageMetadata, OpenGraph, TwitterCard, RobotsMeta,
20    AlternateLink, StructuredData,
21    ParserResult,
22};
23
24// ============================================================================
25// MAIN EXTRACTION FUNCTION
26// ============================================================================
27
28/// Extract all metadata from an HTML document
29pub fn extract_metadata(document: &Html, base_url: Option<&Url>) -> ParserResult<PageMetadata> {
30    let metadata = PageMetadata {
31        title: extract_title(document),
32        charset: extract_charset(document),
33        language: extract_language(document),
34        base_url: extract_base_url(document),
35        viewport: extract_meta_content(document, "viewport"),
36        description: extract_meta_content(document, "description"),
37        keywords: extract_keywords(document),
38        author: extract_meta_content(document, "author"),
39        generator: extract_meta_content(document, "generator"),
40        published_date: extract_meta_content(document, "article:published_time")
41            .or_else(|| extract_meta_content(document, "datePublished"))
42            .or_else(|| extract_meta_content(document, "date")),
43        modified_date: extract_meta_content(document, "article:modified_time")
44            .or_else(|| extract_meta_content(document, "dateModified")),
45        canonical: extract_canonical(document, base_url),
46        favicon: extract_favicon(document, base_url),
47        apple_touch_icon: extract_apple_touch_icon(document, base_url),
48        theme_color: extract_meta_content(document, "theme-color"),
49        robots: extract_robots(document),
50        opengraph: extract_opengraph(document),
51        twitter: extract_twitter_card(document),
52        alternates: extract_alternates(document, base_url),
53        schema_type: None,
54        custom: extract_custom_meta(document),
55    };
56    
57    Ok(metadata)
58}
59
60// ============================================================================
61// BASIC METADATA
62// ============================================================================
63
64/// Extract page title
65pub fn extract_title(document: &Html) -> Option<String> {
66    document
67        .select(&SELECTORS.title)
68        .next()
69        .map(|el| el.text().collect::<String>().trim().to_string())
70        .filter(|s| !s.is_empty())
71}
72
73/// Extract charset from meta tag
74pub fn extract_charset(document: &Html) -> Option<String> {
75    // Try <meta charset="...">
76    for meta in document.select(&SELECTORS.meta) {
77        if let Some(charset) = meta.value().attr("charset") {
78            return Some(charset.to_uppercase());
79        }
80    }
81    
82    // Try <meta http-equiv="Content-Type" content="...">
83    if let Some(sel) = try_parse_selector("meta[http-equiv='Content-Type']") {
84        if let Some(meta) = document.select(&sel).next() {
85            if let Some(content) = meta.value().attr("content") {
86                // Parse "text/html; charset=utf-8"
87                if let Some(charset_part) = content.split(';')
88                    .find(|p| p.trim().to_lowercase().starts_with("charset"))
89                {
90                    if let Some(charset) = charset_part.split('=').nth(1) {
91                        return Some(charset.trim().to_uppercase());
92                    }
93                }
94            }
95        }
96    }
97    
98    None
99}
100
101/// Extract language from html lang attribute
102pub fn extract_language(document: &Html) -> Option<String> {
103    document
104        .select(&SELECTORS.html)
105        .next()
106        .and_then(|el| el.value().attr("lang"))
107        .map(|s| s.to_string())
108}
109
110/// Extract base URL from <base> tag
111pub fn extract_base_url(document: &Html) -> Option<String> {
112    document
113        .select(&SELECTORS.base)
114        .next()
115        .and_then(|el| el.value().attr("href"))
116        .map(|s| s.to_string())
117}
118
119/// Extract content from a meta tag by name
120pub fn extract_meta_content(document: &Html, name: &str) -> Option<String> {
121    // Try name attribute first
122    let name_selector = format!("meta[name='{}' i]", name);
123    if let Some(sel) = try_parse_selector(&name_selector) {
124        if let Some(meta) = document.select(&sel).next() {
125            if let Some(content) = meta.value().attr("content") {
126                let trimmed = content.trim();
127                if !trimmed.is_empty() {
128                    return Some(trimmed.to_string());
129                }
130            }
131        }
132    }
133    
134    // Try property attribute (for OG tags)
135    let prop_selector = format!("meta[property='{}']", name);
136    if let Some(sel) = try_parse_selector(&prop_selector) {
137        if let Some(meta) = document.select(&sel).next() {
138            if let Some(content) = meta.value().attr("content") {
139                let trimmed = content.trim();
140                if !trimmed.is_empty() {
141                    return Some(trimmed.to_string());
142                }
143            }
144        }
145    }
146    
147    None
148}
149
150/// Extract keywords as a list
151pub fn extract_keywords(document: &Html) -> Vec<String> {
152    extract_meta_content(document, "keywords")
153        .map(|s| {
154            s.split(',')
155                .map(|k| k.trim().to_string())
156                .filter(|k| !k.is_empty())
157                .collect()
158        })
159        .unwrap_or_default()
160}
161
162// ============================================================================
163// LINK EXTRACTION
164// ============================================================================
165
166/// Extract canonical URL
167pub fn extract_canonical(document: &Html, base_url: Option<&Url>) -> Option<String> {
168    if let Some(sel) = try_parse_selector("link[rel='canonical']") {
169        if let Some(link) = document.select(&sel).next() {
170            if let Some(href) = link.value().attr("href") {
171                return resolve_url(href, base_url);
172            }
173        }
174    }
175    None
176}
177
178/// Extract favicon URL
179pub fn extract_favicon(document: &Html, base_url: Option<&Url>) -> Option<String> {
180    // Try various favicon link types
181    let selectors = [
182        "link[rel='icon']",
183        "link[rel='shortcut icon']",
184        "link[rel='icon shortcut']",
185    ];
186    
187    for sel_str in selectors {
188        if let Some(sel) = try_parse_selector(sel_str) {
189            if let Some(link) = document.select(&sel).next() {
190                if let Some(href) = link.value().attr("href") {
191                    return resolve_url(href, base_url);
192                }
193            }
194        }
195    }
196    
197    // Default to /favicon.ico
198    base_url.map(|u| {
199        let mut favicon_url = u.clone();
200        favicon_url.set_path("/favicon.ico");
201        favicon_url.set_query(None);
202        favicon_url.to_string()
203    })
204}
205
206/// Extract Apple touch icon
207pub fn extract_apple_touch_icon(document: &Html, base_url: Option<&Url>) -> Option<String> {
208    let selectors = [
209        "link[rel='apple-touch-icon']",
210        "link[rel='apple-touch-icon-precomposed']",
211    ];
212    
213    for sel_str in selectors {
214        if let Some(sel) = try_parse_selector(sel_str) {
215            // Get the largest icon if multiple exist
216            let icons: Vec<_> = document.select(&sel).collect();
217            if let Some(icon) = find_largest_icon(&icons) {
218                if let Some(href) = icon.value().attr("href") {
219                    return resolve_url(href, base_url);
220                }
221            }
222        }
223    }
224    
225    None
226}
227
228/// Find the largest icon from a list of link elements
229fn find_largest_icon<'a>(icons: &'a [ElementRef<'a>]) -> Option<&'a ElementRef<'a>> {
230    if icons.is_empty() {
231        return None;
232    }
233    
234    // Try to find one with largest sizes attribute
235    let mut best: Option<(&ElementRef, u32)> = None;
236    
237    for icon in icons {
238        let size = icon.value().attr("sizes")
239            .and_then(|s| {
240                // Parse "180x180" format
241                let parts: Vec<_> = s.split('x').collect();
242                if parts.len() == 2 {
243                    parts[0].parse::<u32>().ok()
244                } else {
245                    None
246                }
247            })
248            .unwrap_or(0);
249        
250        if best.is_none() || size > best.unwrap().1 {
251            best = Some((icon, size));
252        }
253    }
254    
255    best.map(|(el, _)| el)
256}
257
258/// Extract alternate language links (hreflang)
259pub fn extract_alternates(document: &Html, base_url: Option<&Url>) -> Vec<AlternateLink> {
260    let mut alternates = Vec::new();
261    
262    if let Some(sel) = try_parse_selector("link[rel='alternate'][hreflang]") {
263        for link in document.select(&sel) {
264            if let (Some(hreflang), Some(href)) = (
265                link.value().attr("hreflang"),
266                link.value().attr("href"),
267            ) {
268                if let Some(resolved) = resolve_url(href, base_url) {
269                    alternates.push(AlternateLink {
270                        hreflang: hreflang.to_string(),
271                        href: resolved,
272                    });
273                }
274            }
275        }
276    }
277    
278    alternates
279}
280
281// ============================================================================
282// ROBOTS META
283// ============================================================================
284
285/// Extract robots meta directives
286pub fn extract_robots(document: &Html) -> RobotsMeta {
287    let mut robots = RobotsMeta::allowed();
288    
289    // Get robots meta content
290    let content = extract_meta_content(document, "robots")
291        .or_else(|| extract_meta_content(document, "googlebot"));
292    
293    if let Some(content) = content {
294        robots.raw = Some(content.clone());
295        
296        let directives: Vec<_> = content
297            .to_lowercase()
298            .split(',')
299            .map(|s| s.trim().to_string())
300            .collect();
301        
302        for directive in &directives {
303            match directive.as_str() {
304                "noindex" => robots.index = false,
305                "nofollow" => robots.follow = false,
306                "none" => {
307                    robots.index = false;
308                    robots.follow = false;
309                }
310                "noarchive" => robots.archive = false,
311                "nocache" => robots.cache = false,
312                "nosnippet" => robots.snippet = false,
313                _ => {
314                    // Handle max-snippet:N, max-image-preview:SIZE, max-video-preview:N
315                    if let Some((key, value)) = directive.split_once(':') {
316                        match key {
317                            "max-snippet" => {
318                                robots.max_snippet = value.parse().unwrap_or(-1);
319                            }
320                            "max-image-preview" => {
321                                robots.max_image_preview = Some(value.to_string());
322                            }
323                            "max-video-preview" => {
324                                robots.max_video_preview = value.parse().unwrap_or(-1);
325                            }
326                            _ => {}
327                        }
328                    }
329                }
330            }
331        }
332    }
333    
334    robots
335}
336
337// ============================================================================
338// OPENGRAPH
339// ============================================================================
340
341/// Extract OpenGraph metadata
342pub fn extract_opengraph(document: &Html) -> OpenGraph {
343    OpenGraph {
344        title: extract_og_property(document, "og:title"),
345        og_type: extract_og_property(document, "og:type"),
346        url: extract_og_property(document, "og:url"),
347        image: extract_og_property(document, "og:image"),
348        description: extract_og_property(document, "og:description"),
349        site_name: extract_og_property(document, "og:site_name"),
350        locale: extract_og_property(document, "og:locale"),
351        video: extract_og_property(document, "og:video"),
352        audio: extract_og_property(document, "og:audio"),
353        extra: extract_all_og_properties(document),
354    }
355}
356
357/// Extract a single OG property
358fn extract_og_property(document: &Html, property: &str) -> Option<String> {
359    let selector = format!("meta[property='{}']", property);
360    if let Some(sel) = try_parse_selector(&selector) {
361        if let Some(meta) = document.select(&sel).next() {
362            return meta.value().attr("content")
363                .map(|s| s.trim().to_string())
364                .filter(|s| !s.is_empty());
365        }
366    }
367    None
368}
369
370/// Extract all og: properties as a map
371fn extract_all_og_properties(document: &Html) -> HashMap<String, String> {
372    let mut props = HashMap::new();
373    
374    // Standard properties to exclude from extra
375    let standard = ["og:title", "og:type", "og:url", "og:image", 
376                   "og:description", "og:site_name", "og:locale", 
377                   "og:video", "og:audio"];
378    
379    for meta in document.select(&SELECTORS.meta) {
380        if let Some(property) = meta.value().attr("property") {
381            if property.starts_with("og:") && !standard.contains(&property) {
382                if let Some(content) = meta.value().attr("content") {
383                    props.insert(property.to_string(), content.to_string());
384                }
385            }
386        }
387    }
388    
389    props
390}
391
392// ============================================================================
393// TWITTER CARD
394// ============================================================================
395
396/// Extract Twitter Card metadata
397pub fn extract_twitter_card(document: &Html) -> TwitterCard {
398    TwitterCard {
399        card: extract_twitter_property(document, "twitter:card"),
400        site: extract_twitter_property(document, "twitter:site"),
401        creator: extract_twitter_property(document, "twitter:creator"),
402        title: extract_twitter_property(document, "twitter:title"),
403        description: extract_twitter_property(document, "twitter:description"),
404        image: extract_twitter_property(document, "twitter:image"),
405        extra: extract_all_twitter_properties(document),
406    }
407}
408
409/// Extract a single Twitter property
410fn extract_twitter_property(document: &Html, name: &str) -> Option<String> {
411    // Try property attribute first (some sites use this)
412    let prop_selector = format!("meta[property='{}']", name);
413    if let Some(sel) = try_parse_selector(&prop_selector) {
414        if let Some(meta) = document.select(&sel).next() {
415            if let Some(content) = meta.value().attr("content") {
416                let trimmed = content.trim();
417                if !trimmed.is_empty() {
418                    return Some(trimmed.to_string());
419                }
420            }
421        }
422    }
423    
424    // Then try name attribute
425    let name_selector = format!("meta[name='{}']", name);
426    if let Some(sel) = try_parse_selector(&name_selector) {
427        if let Some(meta) = document.select(&sel).next() {
428            if let Some(content) = meta.value().attr("content") {
429                let trimmed = content.trim();
430                if !trimmed.is_empty() {
431                    return Some(trimmed.to_string());
432                }
433            }
434        }
435    }
436    
437    None
438}
439
440/// Extract all twitter: properties as a map
441fn extract_all_twitter_properties(document: &Html) -> HashMap<String, String> {
442    let mut props = HashMap::new();
443    
444    let standard = ["twitter:card", "twitter:site", "twitter:creator",
445                   "twitter:title", "twitter:description", "twitter:image"];
446    
447    for meta in document.select(&SELECTORS.meta) {
448        let key = meta.value().attr("property")
449            .or_else(|| meta.value().attr("name"));
450        
451        if let Some(key) = key {
452            if key.starts_with("twitter:") && !standard.contains(&key) {
453                if let Some(content) = meta.value().attr("content") {
454                    props.insert(key.to_string(), content.to_string());
455                }
456            }
457        }
458    }
459    
460    props
461}
462
463// ============================================================================
464// STRUCTURED DATA
465// ============================================================================
466
467/// Extract all structured data from the document
468pub fn extract_structured_data(document: &Html) -> Vec<StructuredData> {
469    let mut data = Vec::new();
470    
471    // Extract JSON-LD
472    data.extend(extract_json_ld(document));
473    
474    // Extract Microdata
475    data.extend(extract_microdata(document));
476    
477    data
478}
479
480/// Extract JSON-LD structured data
481pub fn extract_json_ld(document: &Html) -> Vec<StructuredData> {
482    let mut data = Vec::new();
483    
484    for script in document.select(&SELECTORS.json_ld) {
485        let raw_json = script.text().collect::<String>();
486        let trimmed = raw_json.trim();
487        
488        if trimmed.is_empty() {
489            continue;
490        }
491        
492        let mut item = StructuredData::json_ld(trimmed);
493        
494        // Try to parse and extract @type
495        if let Ok(json) = serde_json::from_str::<serde_json::Value>(trimmed) {
496            if let Some(schema_type) = json.get("@type").and_then(|v| v.as_str()) {
497                item.schema_type = Some(schema_type.to_string());
498            }
499            
500            // Store parsed properties
501            if let serde_json::Value::Object(map) = json {
502                for (key, value) in map {
503                    item.properties.insert(key, value);
504                }
505            }
506        }
507        
508        data.push(item);
509    }
510    
511    data
512}
513
514/// Extract Microdata structured data
515pub fn extract_microdata(document: &Html) -> Vec<StructuredData> {
516    let mut data = Vec::new();
517    
518    for item in document.select(&SELECTORS.microdata) {
519        if let Some(itemtype) = item.value().attr("itemtype") {
520            // Extract schema type from URL (e.g., "https://schema.org/Article" -> "Article")
521            let schema_type = itemtype
522                .rsplit('/')
523                .next()
524                .unwrap_or(itemtype)
525                .to_string();
526            
527            let mut structured = StructuredData::microdata(&schema_type);
528            
529            // Extract itemprop values
530            if let Some(sel) = try_parse_selector("[itemprop]") {
531                for prop in item.select(&sel) {
532                    if let Some(prop_name) = prop.value().attr("itemprop") {
533                        // Get value from content attribute or text
534                        let value = prop.value().attr("content")
535                            .or_else(|| prop.value().attr("href"))
536                            .or_else(|| prop.value().attr("src"))
537                            .map(|s| s.to_string())
538                            .unwrap_or_else(|| prop.text().collect::<String>().trim().to_string());
539                        
540                        structured.properties.insert(
541                            prop_name.to_string(),
542                            serde_json::Value::String(value),
543                        );
544                    }
545                }
546            }
547            
548            data.push(structured);
549        }
550    }
551    
552    data
553}
554
555// ============================================================================
556// CUSTOM META
557// ============================================================================
558
559/// Extract custom/application-specific meta tags
560fn extract_custom_meta(document: &Html) -> HashMap<String, String> {
561    let mut custom = HashMap::new();
562    
563    // Skip standard meta names
564    let standard = [
565        "description", "keywords", "author", "viewport", "robots",
566        "generator", "theme-color", "msapplication-TileColor",
567    ];
568    
569    for meta in document.select(&SELECTORS.meta) {
570        if let Some(name) = meta.value().attr("name") {
571            // Skip standard, OG, and Twitter tags
572            if !standard.contains(&name) 
573               && !name.starts_with("og:")
574               && !name.starts_with("twitter:")
575               && !name.starts_with("article:")
576            {
577                if let Some(content) = meta.value().attr("content") {
578                    custom.insert(name.to_string(), content.to_string());
579                }
580            }
581        }
582    }
583    
584    custom
585}
586
587// ============================================================================
588// URL UTILITIES
589// ============================================================================
590
591/// Resolve a relative URL to absolute
592fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
593    let trimmed = href.trim();
594    
595    if trimmed.is_empty() {
596        return None;
597    }
598    
599    // Already absolute
600    if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
601        return Some(trimmed.to_string());
602    }
603    
604    // Protocol-relative
605    if trimmed.starts_with("//") {
606        return Some(format!("https:{}", trimmed));
607    }
608    
609    // Resolve relative to base
610    base_url
611        .and_then(|base| base.join(trimmed).ok())
612        .map(|u| u.to_string())
613}
614
615// ============================================================================
616// TESTS
617// ============================================================================
618
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::StructuredDataFormat;

    /// Shorthand for parsing a full HTML document in tests.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    #[test]
    fn test_extract_title() {
        let doc = parse_html("<html><head><title>Test Page</title></head></html>");
        assert_eq!(extract_title(&doc), Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_with_whitespace() {
        // Title text is trimmed before being returned.
        let doc = parse_html("<html><head><title>  Test Page  </title></head></html>");
        assert_eq!(extract_title(&doc), Some("Test Page".to_string()));
    }

    #[test]
    fn test_extract_title_empty() {
        // An empty <title> yields None, not Some("").
        let doc = parse_html("<html><head><title></title></head></html>");
        assert_eq!(extract_title(&doc), None);
    }

    #[test]
    fn test_extract_charset_meta() {
        let doc = parse_html("<html><head><meta charset='UTF-8'></head></html>");
        assert_eq!(extract_charset(&doc), Some("UTF-8".to_string()));
    }

    #[test]
    fn test_extract_charset_content_type() {
        // Legacy http-equiv form; result is uppercased.
        let doc = parse_html(
            "<html><head><meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'></head></html>"
        );
        assert_eq!(extract_charset(&doc), Some("ISO-8859-1".to_string()));
    }

    #[test]
    fn test_extract_language() {
        let doc = parse_html("<html lang='en-US'><head></head></html>");
        assert_eq!(extract_language(&doc), Some("en-US".to_string()));
    }

    #[test]
    fn test_extract_meta_content() {
        let doc = parse_html(
            "<html><head><meta name='description' content='Test description'></head></html>"
        );
        assert_eq!(
            extract_meta_content(&doc, "description"),
            Some("Test description".to_string())
        );
    }

    #[test]
    fn test_extract_keywords() {
        // Comma-separated keywords are split and trimmed.
        let doc = parse_html(
            "<html><head><meta name='keywords' content='rust, web, scraping'></head></html>"
        );
        let keywords = extract_keywords(&doc);
        assert_eq!(keywords, vec!["rust", "web", "scraping"]);
    }

    #[test]
    fn test_extract_canonical() {
        let doc = parse_html(
            "<html><head><link rel='canonical' href='https://example.com/page'></head></html>"
        );
        assert_eq!(
            extract_canonical(&doc, None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_extract_canonical_relative() {
        // A relative canonical href is resolved against the base URL.
        let doc = parse_html(
            "<html><head><link rel='canonical' href='/page'></head></html>"
        );
        let base = Url::parse("https://example.com").unwrap();
        assert_eq!(
            extract_canonical(&doc, Some(&base)),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_extract_robots_default() {
        // With no robots tag, the permissive defaults apply.
        let doc = parse_html("<html><head></head></html>");
        let robots = extract_robots(&doc);
        assert!(robots.index);
        assert!(robots.follow);
    }

    #[test]
    fn test_extract_robots_noindex() {
        let doc = parse_html(
            "<html><head><meta name='robots' content='noindex, nofollow'></head></html>"
        );
        let robots = extract_robots(&doc);
        assert!(!robots.index);
        assert!(!robots.follow);
    }

    #[test]
    fn test_extract_robots_advanced() {
        // Keyed directives (max-snippet:N, max-image-preview:SIZE) are parsed.
        let doc = parse_html(
            "<html><head><meta name='robots' content='noarchive, max-snippet:150, max-image-preview:large'></head></html>"
        );
        let robots = extract_robots(&doc);
        assert!(robots.index);
        assert!(!robots.archive);
        assert_eq!(robots.max_snippet, 150);
        assert_eq!(robots.max_image_preview, Some("large".to_string()));
    }

    #[test]
    fn test_extract_opengraph() {
        let doc = parse_html(r#"
            <html><head>
                <meta property="og:title" content="OG Title">
                <meta property="og:type" content="article">
                <meta property="og:url" content="https://example.com/article">
                <meta property="og:image" content="https://example.com/image.jpg">
                <meta property="og:description" content="OG Description">
            </head></html>
        "#);
        
        let og = extract_opengraph(&doc);
        assert!(og.is_present());
        assert_eq!(og.title, Some("OG Title".to_string()));
        assert_eq!(og.og_type, Some("article".to_string()));
        assert_eq!(og.url, Some("https://example.com/article".to_string()));
    }

    #[test]
    fn test_extract_twitter_card() {
        let doc = parse_html(r#"
            <html><head>
                <meta name="twitter:card" content="summary_large_image">
                <meta name="twitter:site" content="@example">
                <meta name="twitter:title" content="Twitter Title">
            </head></html>
        "#);
        
        let twitter = extract_twitter_card(&doc);
        assert!(twitter.is_present());
        assert_eq!(twitter.card, Some("summary_large_image".to_string()));
        assert_eq!(twitter.site, Some("@example".to_string()));
    }

    #[test]
    fn test_extract_alternates() {
        // x-default is a valid hreflang value and must be included.
        let doc = parse_html(r#"
            <html><head>
                <link rel="alternate" hreflang="en" href="https://example.com/en/">
                <link rel="alternate" hreflang="fr" href="https://example.com/fr/">
                <link rel="alternate" hreflang="x-default" href="https://example.com/">
            </head></html>
        "#);
        
        let alternates = extract_alternates(&doc, None);
        assert_eq!(alternates.len(), 3);
        assert!(alternates.iter().any(|a| a.hreflang == "en"));
        assert!(alternates.iter().any(|a| a.hreflang == "fr"));
        assert!(alternates.iter().any(|a| a.hreflang == "x-default"));
    }

    #[test]
    fn test_extract_json_ld() {
        let doc = parse_html(r#"
            <html><head>
                <script type="application/ld+json">
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Test Article"
                }
                </script>
            </head></html>
        "#);
        
        let data = extract_json_ld(&doc);
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].format, StructuredDataFormat::JsonLd);
        assert_eq!(data[0].schema_type, Some("Article".to_string()));
    }

    #[test]
    fn test_extract_microdata() {
        // schema_type comes from the last path segment of itemtype.
        let doc = parse_html(r#"
            <div itemscope itemtype="https://schema.org/Person">
                <span itemprop="name">John Doe</span>
                <span itemprop="jobTitle">Software Engineer</span>
            </div>
        "#);
        
        let data = extract_microdata(&doc);
        assert_eq!(data.len(), 1);
        assert_eq!(data[0].format, StructuredDataFormat::Microdata);
        assert_eq!(data[0].schema_type, Some("Person".to_string()));
    }

    #[test]
    fn test_extract_metadata_full() {
        // End-to-end: all sub-extractors feed into extract_metadata.
        let doc = parse_html(r#"
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <title>Full Test Page</title>
                <meta name="description" content="A complete test page">
                <meta name="keywords" content="test, page, rust">
                <meta name="author" content="Test Author">
                <meta name="robots" content="index, follow">
                <link rel="canonical" href="https://example.com/page">
                <meta property="og:title" content="OG Title">
                <meta name="twitter:card" content="summary">
            </head>
            <body></body>
            </html>
        "#);
        
        let metadata = extract_metadata(&doc, None).unwrap();
        
        assert_eq!(metadata.title, Some("Full Test Page".to_string()));
        assert_eq!(metadata.description, Some("A complete test page".to_string()));
        assert_eq!(metadata.language, Some("en".to_string()));
        assert_eq!(metadata.charset, Some("UTF-8".to_string()));
        assert!(metadata.robots.index);
        assert!(metadata.opengraph.is_present());
        assert!(metadata.twitter.is_present());
    }

    #[test]
    fn test_resolve_url_absolute() {
        assert_eq!(
            resolve_url("https://example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_protocol_relative() {
        // With no base URL, protocol-relative hrefs default to https.
        assert_eq!(
            resolve_url("//example.com/page", None),
            Some("https://example.com/page".to_string())
        );
    }

    #[test]
    fn test_resolve_url_relative() {
        assert_eq!(
            resolve_url("page.html", Some(&Url::parse("https://example.com/dir/").unwrap())),
            Some("https://example.com/dir/page.html".to_string())
        );
    }

    #[test]
    fn test_favicon_default() {
        // No declared icon: fall back to /favicon.ico at the site root.
        let doc = parse_html("<html><head></head></html>");
        let base = Url::parse("https://example.com/page").unwrap();
        let favicon = extract_favicon(&doc, Some(&base));
        assert_eq!(favicon, Some("https://example.com/favicon.ico".to_string()));
    }

    #[test]
    fn test_favicon_explicit() {
        let doc = parse_html(
            "<html><head><link rel='icon' href='/icons/favicon.png'></head></html>"
        );
        let base = Url::parse("https://example.com").unwrap();
        let favicon = extract_favicon(&doc, Some(&base));
        assert_eq!(favicon, Some("https://example.com/icons/favicon.png".to_string()));
    }
}