Skip to main content

schemaorg_rs/extraction/
microdata.rs

1//! Microdata extractor: parses `itemscope`/`itemprop` attributes.
2//!
3//! Implements the [W3C Microdata to RDF](https://www.w3.org/TR/microdata-rdf/)
4//! extraction algorithm using `scraper` for DOM traversal.
5//!
6//! ## Supported features
7//!
8//! - `itemscope` / `itemtype` for defining nodes
9//! - `itemprop` for defining properties
10//! - Nested item scopes (property that is itself an item)
11//! - `itemref` for non-contiguous DOM references
12//! - `itemid` for global identifiers
13//! - Space-separated `itemprop` values (one element -> multiple properties)
14//! - Value extraction by element type (`<meta>`, `<a>`, `<img>`, `<time>`, etc.)
15//!
16//! ## Depth limit
17//!
18//! Nested scopes are limited to 20 levels to prevent stack overflow on
19//! malformed markup with circular or excessively deep nesting.
20
21use std::sync::OnceLock;
22
23use ego_tree::NodeRef;
24use indexmap::IndexMap;
25use scraper::node::Node;
26use scraper::{Html, Selector};
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Maximum nesting depth for Microdata scopes.
///
/// Top-level items are extracted at depth 0 and every nested item scope
/// adds one. `extract_item` gives up (pushing a warning and returning
/// `None`) once `depth > MAX_DEPTH`.
const MAX_DEPTH: usize = 20;
35
/// Extracts Schema.org structured data from HTML Microdata attributes.
///
/// The extractor is a stateless unit struct, so it is cheap to copy and can
/// be created with [`Default`].
///
/// # Examples
///
/// ```
/// use schemaorg_rs::extraction::{Extractor, MicrodataExtractor};
///
/// let html = r#"<html><body>
/// <div itemscope itemtype="https://schema.org/Product">
/// <span itemprop="name">Widget</span>
/// </div>
/// </body></html>"#;
///
/// let output = MicrodataExtractor.extract(html).unwrap();
/// assert_eq!(output.nodes[0].types, vec!["Product"]);
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct MicrodataExtractor;
53
54impl Extractor for MicrodataExtractor {
55    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
56        let document = Html::parse_document(html);
57        self.extract_from_document(&document)
58    }
59}
60
61impl MicrodataExtractor {
62    /// Extracts from an already-parsed document.
63    ///
64    /// # Errors
65    ///
66    /// Returns [`ExtractionError`] if a fatal error prevents extraction.
67    /// Most issues are captured as warnings in the returned output.
68    ///
69    /// # Panics
70    ///
71    /// Panics if the internal CSS selector constant fails to parse.
72    /// This is a compile-time-verified string and will never fail.
73    pub fn extract_from_document(
74        &self,
75        document: &Html,
76    ) -> Result<ExtractionOutput, ExtractionError> {
77        // Find top-level items: elements with itemscope but NOT itemprop
78        // (itemprop + itemscope = nested item, handled during parent traversal)
79        static SELECTOR: OnceLock<Selector> = OnceLock::new();
80        let selector = SELECTOR.get_or_init(|| {
81            Selector::parse("[itemscope]").expect("static selector '[itemscope]' must parse")
82        });
83
84        let mut warnings = Vec::new();
85        let mut nodes = Vec::new();
86
87        for element in document.select(selector) {
88            // Skip nested items: those with itemprop are handled by their parent
89            if element.value().attr("itemprop").is_some() {
90                continue;
91            }
92
93            match extract_item(&element, document, &mut warnings, 0) {
94                Some(node) => nodes.push(node),
95                None => {
96                    warnings.push(ExtractionWarning {
97                        message: "failed to extract Microdata item".into(),
98                        source_location: None,
99                        code: WarningCode::MalformedMicrodata,
100                    });
101                }
102            }
103        }
104
105        Ok(ExtractionOutput { nodes, warnings })
106    }
107}
108
109/// Extract a single Microdata item from an element with `itemscope`.
110fn extract_item(
111    element: &scraper::ElementRef<'_>,
112    document: &Html,
113    warnings: &mut Vec<ExtractionWarning>,
114    depth: usize,
115) -> Option<SchemaNode> {
116    if depth > MAX_DEPTH {
117        warnings.push(ExtractionWarning {
118            message: format!("Microdata nesting depth exceeds {MAX_DEPTH}, skipping"),
119            source_location: None,
120            code: WarningCode::MalformedMicrodata,
121        });
122        return None;
123    }
124
125    let el = element.value();
126
127    // Extract itemtype -> types
128    let types = extract_itemtypes(el);
129
130    // Build properties from itemprop descendants + itemref targets
131    let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
132
133    // Store itemid as @id
134    if let Some(item_id) = el.attr("itemid") {
135        properties
136            .entry("@id".into())
137            .or_default()
138            .push(classify_text_value(item_id));
139    }
140
141    // Collect itemprop elements from the subtree
142    collect_properties(element, document, warnings, &mut properties, depth);
143
144    // Handle itemref: collect properties from referenced elements
145    if let Some(refs) = el.attr("itemref") {
146        for ref_id in refs.split_whitespace() {
147            // Use direct DOM traversal instead of CSS selectors to avoid
148            // selector injection with special characters in IDs.
149            match find_element_by_id(document, ref_id) {
150                Some(ref_element) => {
151                    if ref_element.value().attr("itemprop").is_some() {
152                        extract_prop_value(
153                            &ref_element,
154                            document,
155                            warnings,
156                            &mut properties,
157                            depth,
158                        );
159                    } else {
160                        collect_properties(
161                            &ref_element,
162                            document,
163                            warnings,
164                            &mut properties,
165                            depth,
166                        );
167                    }
168                }
169                None => {
170                    warnings.push(ExtractionWarning {
171                        message: format!("itemref target not found: #{ref_id}"),
172                        source_location: None,
173                        code: WarningCode::UnresolvableReference,
174                    });
175                }
176            }
177        }
178    }
179
180    if types.is_empty() && properties.is_empty() {
181        return None;
182    }
183
184    if types.is_empty() {
185        warnings.push(ExtractionWarning {
186            message: "Microdata item has itemscope but no itemtype".into(),
187            source_location: None,
188            code: WarningCode::EmptyType,
189        });
190    }
191
192    Some(SchemaNode {
193        types,
194        properties,
195        source_format: SourceFormat::Microdata,
196        source_location: None,
197    })
198}
199
200/// Collects `itemprop` elements from the subtree of a given element.
201///
202/// Walks the immediate children; for each child with `itemprop`, extracts
203/// the property. For children without `itemprop` that are NOT a new
204/// `itemscope`, recurse into their subtree to find deeper `itemprop` elements.
205fn collect_properties(
206    element: &scraper::ElementRef<'_>,
207    document: &Html,
208    warnings: &mut Vec<ExtractionWarning>,
209    properties: &mut IndexMap<String, Vec<SchemaValue>>,
210    depth: usize,
211) {
212    for child in element.children() {
213        visit_for_properties(child, document, warnings, properties, depth);
214    }
215}
216
217/// Recursively visits a node looking for `itemprop` elements.
218fn visit_for_properties(
219    node: NodeRef<'_, Node>,
220    document: &Html,
221    warnings: &mut Vec<ExtractionWarning>,
222    properties: &mut IndexMap<String, Vec<SchemaValue>>,
223    depth: usize,
224) {
225    if let Some(el) = node.value().as_element() {
226        let Some(elem_ref) = scraper::ElementRef::wrap(node) else {
227            return;
228        };
229
230        if el.attr("itemprop").is_some() {
231            // This is a property - extract its value
232            extract_prop_value(&elem_ref, document, warnings, properties, depth);
233            return; // Don't recurse further - the property owns its subtree
234        }
235
236        // If this is a new itemscope WITHOUT itemprop, it's a separate top-level item.
237        // Don't traverse into it - it's handled by the top-level loop.
238        if el.attr("itemscope").is_some() {
239            return;
240        }
241    }
242
243    // Not an itemprop, not a new itemscope - recurse into children
244    for child in node.children() {
245        visit_for_properties(child, document, warnings, properties, depth);
246    }
247}
248
249/// Extracts one or more property values from an `itemprop` element.
250///
251/// Handles space-separated `itemprop` names (one element -> multiple properties).
252fn extract_prop_value(
253    element: &scraper::ElementRef<'_>,
254    document: &Html,
255    warnings: &mut Vec<ExtractionWarning>,
256    properties: &mut IndexMap<String, Vec<SchemaValue>>,
257    depth: usize,
258) {
259    let el = element.value();
260    let prop_names: Vec<&str> = el
261        .attr("itemprop")
262        .unwrap_or("")
263        .split_whitespace()
264        .collect();
265
266    if prop_names.is_empty() {
267        return;
268    }
269
270    let value = extract_element_value(element, document, warnings, depth);
271
272    for name in prop_names {
273        properties
274            .entry(name.to_string())
275            .or_default()
276            .push(value.clone());
277    }
278}
279
280/// Extracts the value from an element based on its tag name.
281///
282/// Follows the W3C Microdata extraction rules:
283/// - `<meta>` -> `content` attribute
284/// - `<a>`, `<link>`, `<area>` -> `href` attribute (URL)
285/// - `<img>`, `<audio>`, `<video>`, `<source>` -> `src` attribute (URL)
286/// - `<time>` -> `datetime` attribute
287/// - `<data>` -> `value` attribute
288/// - `<meter>` -> `value` attribute (Number)
289/// - Element with `itemscope` -> nested node
290/// - Everything else -> text content
291fn extract_element_value(
292    element: &scraper::ElementRef<'_>,
293    document: &Html,
294    warnings: &mut Vec<ExtractionWarning>,
295    depth: usize,
296) -> SchemaValue {
297    let el = element.value();
298    let tag = el.name();
299
300    // Nested item scope
301    if el.attr("itemscope").is_some() {
302        return match extract_item(element, document, warnings, depth + 1) {
303            Some(node) => SchemaValue::Node(Box::new(node)),
304            None => SchemaValue::Text(String::new()),
305        };
306    }
307
308    match tag {
309        "meta" => {
310            let content = el.attr("content").unwrap_or("");
311            classify_text_value(content)
312        }
313        "a" | "link" | "area" => {
314            let href = el.attr("href").unwrap_or("");
315            if href.is_empty() {
316                SchemaValue::Text(element.text().collect::<String>().trim().to_string())
317            } else {
318                SchemaValue::Url(href.to_string())
319            }
320        }
321        "img" | "audio" | "video" | "source" | "embed" => {
322            let src = el.attr("src").unwrap_or("");
323            if src.is_empty() {
324                SchemaValue::Text(String::new())
325            } else {
326                SchemaValue::Url(src.to_string())
327            }
328        }
329        "object" => {
330            let data = el.attr("data").unwrap_or("");
331            if data.is_empty() {
332                SchemaValue::Text(String::new())
333            } else {
334                SchemaValue::Url(data.to_string())
335            }
336        }
337        "time" => {
338            let datetime = el.attr("datetime").unwrap_or("");
339            if datetime.is_empty() {
340                SchemaValue::Text(element.text().collect::<String>().trim().to_string())
341            } else {
342                SchemaValue::DateTime(datetime.to_string())
343            }
344        }
345        "data" => {
346            let val = el.attr("value").unwrap_or("");
347            if val.is_empty() {
348                SchemaValue::Text(element.text().collect::<String>().trim().to_string())
349            } else {
350                classify_text_value(val)
351            }
352        }
353        "meter" => {
354            let val = el.attr("value").unwrap_or("");
355            match val.parse::<f64>() {
356                Ok(n) => SchemaValue::Number(n),
357                Err(_) => SchemaValue::Text(val.to_string()),
358            }
359        }
360        _ => {
361            let text = element.text().collect::<String>();
362            let trimmed = text.trim().to_string();
363            classify_text_value(&trimmed)
364        }
365    }
366}
367
368/// Extracts `itemtype` values, stripping `schema.org` prefixes.
369fn extract_itemtypes(el: &scraper::node::Element) -> Vec<String> {
370    el.attr("itemtype")
371        .map(|types| {
372            types
373                .split_whitespace()
374                .map(|s| strip_schema_prefix(s).into_owned())
375                .collect()
376        })
377        .unwrap_or_default()
378}
379
380/// Finds an element by its `id` attribute using direct DOM traversal.
381///
382/// Uses attribute comparison instead of CSS selectors to correctly handle
383/// IDs containing special characters (dots, colons, brackets).
384fn find_element_by_id<'a>(document: &'a Html, id: &str) -> Option<scraper::ElementRef<'a>> {
385    document
386        .tree
387        .root()
388        .descendants()
389        .filter_map(scraper::ElementRef::wrap)
390        .find(|el| el.value().id() == Some(id))
391}
392
393#[cfg(test)]
394mod tests {
395    use pretty_assertions::assert_eq;
396
397    use super::*;
398
399    #[test]
400    fn basic_product() {
401        let html = r#"<html><body>
402<div itemscope itemtype="https://schema.org/Product">
403  <span itemprop="name">Widget</span>
404  <span itemprop="description">A great widget</span>
405</div>
406</body></html>"#;
407
408        let out = MicrodataExtractor.extract(html).expect("extraction failed");
409        assert_eq!(out.nodes.len(), 1);
410        assert_eq!(out.nodes[0].types, vec!["Product"]);
411        assert_eq!(out.nodes[0].source_format, SourceFormat::Microdata);
412        assert_eq!(
413            out.nodes[0].properties["name"],
414            vec![SchemaValue::Text("Widget".into())]
415        );
416        assert_eq!(
417            out.nodes[0].properties["description"],
418            vec![SchemaValue::Text("A great widget".into())]
419        );
420    }
421
422    #[test]
423    fn nested_offer() {
424        let html = r#"<html><body>
425<div itemscope itemtype="https://schema.org/Product">
426  <span itemprop="name">Widget</span>
427  <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
428    <span itemprop="priceCurrency">USD</span>
429    <meta itemprop="price" content="29.99">
430  </div>
431</div>
432</body></html>"#;
433
434        let out = MicrodataExtractor.extract(html).expect("extraction failed");
435        assert_eq!(out.nodes.len(), 1);
436        let offers = &out.nodes[0].properties["offers"];
437        assert_eq!(offers.len(), 1);
438        if let SchemaValue::Node(offer) = &offers[0] {
439            assert_eq!(offer.types, vec!["Offer"]);
440            assert_eq!(
441                offer.properties["priceCurrency"],
442                vec![SchemaValue::Text("USD".into())]
443            );
444            assert_eq!(
445                offer.properties["price"],
446                vec![SchemaValue::Text("29.99".into())]
447            );
448        } else {
449            panic!("Expected nested Node for offers");
450        }
451    }
452
453    #[test]
454    fn meta_content() {
455        let html = r#"<html><body>
456<div itemscope itemtype="https://schema.org/Product">
457  <meta itemprop="name" content="Invisible Widget">
458</div>
459</body></html>"#;
460
461        let out = MicrodataExtractor.extract(html).expect("extraction failed");
462        assert_eq!(
463            out.nodes[0].properties["name"],
464            vec![SchemaValue::Text("Invisible Widget".into())]
465        );
466    }
467
468    #[test]
469    fn link_href_as_url() {
470        let html = r#"<html><body>
471<div itemscope itemtype="https://schema.org/Product">
472  <span itemprop="name">Widget</span>
473  <a itemprop="url" href="https://example.com/widget">Link</a>
474</div>
475</body></html>"#;
476
477        let out = MicrodataExtractor.extract(html).expect("extraction failed");
478        assert_eq!(
479            out.nodes[0].properties["url"],
480            vec![SchemaValue::Url("https://example.com/widget".into())]
481        );
482    }
483
484    #[test]
485    fn img_src_as_url() {
486        let html = r#"<html><body>
487<div itemscope itemtype="https://schema.org/Product">
488  <span itemprop="name">Widget</span>
489  <img itemprop="image" src="https://example.com/img.jpg">
490</div>
491</body></html>"#;
492
493        let out = MicrodataExtractor.extract(html).expect("extraction failed");
494        assert_eq!(
495            out.nodes[0].properties["image"],
496            vec![SchemaValue::Url("https://example.com/img.jpg".into())]
497        );
498    }
499
500    #[test]
501    fn time_datetime() {
502        let html = r#"<html><body>
503<div itemscope itemtype="https://schema.org/Event">
504  <span itemprop="name">Concert</span>
505  <time itemprop="startDate" datetime="2024-06-15T19:00:00">June 15</time>
506</div>
507</body></html>"#;
508
509        let out = MicrodataExtractor.extract(html).expect("extraction failed");
510        assert_eq!(
511            out.nodes[0].properties["startDate"],
512            vec![SchemaValue::DateTime("2024-06-15T19:00:00".into())]
513        );
514    }
515
516    #[test]
517    fn meter_value_as_number() {
518        let html = r#"<html><body>
519<div itemscope itemtype="https://schema.org/Product">
520  <span itemprop="name">Widget</span>
521  <meter itemprop="ratingValue" value="4.5" min="0" max="5">4.5 stars</meter>
522</div>
523</body></html>"#;
524
525        let out = MicrodataExtractor.extract(html).expect("extraction failed");
526        assert_eq!(
527            out.nodes[0].properties["ratingValue"],
528            vec![SchemaValue::Number(4.5)]
529        );
530    }
531
532    #[test]
533    fn data_value_attribute() {
534        let html = r#"<html><body>
535<div itemscope itemtype="https://schema.org/Product">
536  <data itemprop="sku" value="12345">Product SKU</data>
537</div>
538</body></html>"#;
539
540        let out = MicrodataExtractor.extract(html).expect("extraction failed");
541        assert_eq!(
542            out.nodes[0].properties["sku"],
543            vec![SchemaValue::Text("12345".into())]
544        );
545    }
546
547    #[test]
548    fn space_separated_itemprop() {
549        let html = r#"<html><body>
550<div itemscope itemtype="https://schema.org/Product">
551  <span itemprop="name alternateName">Widget</span>
552</div>
553</body></html>"#;
554
555        let out = MicrodataExtractor.extract(html).expect("extraction failed");
556        assert_eq!(
557            out.nodes[0].properties["name"],
558            vec![SchemaValue::Text("Widget".into())]
559        );
560        assert_eq!(
561            out.nodes[0].properties["alternateName"],
562            vec![SchemaValue::Text("Widget".into())]
563        );
564    }
565
566    #[test]
567    fn multiple_values_same_property() {
568        let html = r#"<html><body>
569<div itemscope itemtype="https://schema.org/Product">
570  <span itemprop="name">Widget</span>
571  <img itemprop="image" src="https://example.com/img1.jpg">
572  <img itemprop="image" src="https://example.com/img2.jpg">
573</div>
574</body></html>"#;
575
576        let out = MicrodataExtractor.extract(html).expect("extraction failed");
577        assert_eq!(out.nodes[0].properties["image"].len(), 2);
578    }
579
580    #[test]
581    fn itemid_becomes_at_id() {
582        let html = r#"<html><body>
583<div itemscope itemtype="https://schema.org/Product" itemid="https://example.com/product/123">
584  <span itemprop="name">Widget</span>
585</div>
586</body></html>"#;
587
588        let out = MicrodataExtractor.extract(html).expect("extraction failed");
589        assert_eq!(
590            out.nodes[0].properties["@id"],
591            vec![SchemaValue::Url("https://example.com/product/123".into())]
592        );
593    }
594
595    #[test]
596    fn itemref_collects_external_properties() {
597        let html = r#"<html><body>
598<div itemscope itemtype="https://schema.org/Product" itemref="desc-block">
599  <span itemprop="name">Widget</span>
600</div>
601<div id="desc-block">
602  <span itemprop="description">A fine product</span>
603</div>
604</body></html>"#;
605
606        let out = MicrodataExtractor.extract(html).expect("extraction failed");
607        assert_eq!(out.nodes.len(), 1);
608        assert_eq!(
609            out.nodes[0].properties["description"],
610            vec![SchemaValue::Text("A fine product".into())]
611        );
612    }
613
614    #[test]
615    fn itemref_missing_target_warns() {
616        let html = r#"<html><body>
617<div itemscope itemtype="https://schema.org/Product" itemref="nonexistent">
618  <span itemprop="name">Widget</span>
619</div>
620</body></html>"#;
621
622        let out = MicrodataExtractor.extract(html).expect("extraction failed");
623        assert!(out
624            .warnings
625            .iter()
626            .any(|w| w.code == WarningCode::UnresolvableReference));
627    }
628
629    #[test]
630    fn multiple_itemtypes() {
631        let html = r#"<html><body>
632<div itemscope itemtype="https://schema.org/Product https://schema.org/IndividualProduct">
633  <span itemprop="name">Widget</span>
634</div>
635</body></html>"#;
636
637        let out = MicrodataExtractor.extract(html).expect("extraction failed");
638        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
639    }
640
641    #[test]
642    fn http_prefix_stripped() {
643        let html = r#"<html><body>
644<div itemscope itemtype="http://schema.org/Product">
645  <span itemprop="name">Widget</span>
646</div>
647</body></html>"#;
648
649        let out = MicrodataExtractor.extract(html).expect("extraction failed");
650        assert_eq!(out.nodes[0].types, vec!["Product"]);
651    }
652
653    #[test]
654    fn deeply_nested_scopes() {
655        let html = r#"<html><body>
656<div itemscope itemtype="https://schema.org/Product">
657  <span itemprop="name">Widget</span>
658  <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
659    <meta itemprop="price" content="29.99">
660    <div itemprop="seller" itemscope itemtype="https://schema.org/Organization">
661      <span itemprop="name">Acme</span>
662      <div itemprop="address" itemscope itemtype="https://schema.org/PostalAddress">
663        <span itemprop="addressCountry">US</span>
664      </div>
665    </div>
666  </div>
667</div>
668</body></html>"#;
669
670        let out = MicrodataExtractor.extract(html).expect("extraction failed");
671        assert_eq!(out.nodes.len(), 1);
672        if let SchemaValue::Node(offer) = &out.nodes[0].properties["offers"][0] {
673            if let SchemaValue::Node(seller) = &offer.properties["seller"][0] {
674                if let SchemaValue::Node(addr) = &seller.properties["address"][0] {
675                    assert_eq!(addr.types, vec!["PostalAddress"]);
676                    assert_eq!(
677                        addr.properties["addressCountry"],
678                        vec![SchemaValue::Text("US".into())]
679                    );
680                } else {
681                    panic!("Expected PostalAddress node");
682                }
683            } else {
684                panic!("Expected Organization node");
685            }
686        } else {
687            panic!("Expected Offer node");
688        }
689    }
690
691    #[test]
692    fn multiple_top_level_items() {
693        let html = r#"<html><body>
694<div itemscope itemtype="https://schema.org/Product">
695  <span itemprop="name">Widget A</span>
696</div>
697<div itemscope itemtype="https://schema.org/Product">
698  <span itemprop="name">Widget B</span>
699</div>
700</body></html>"#;
701
702        let out = MicrodataExtractor.extract(html).expect("extraction failed");
703        assert_eq!(out.nodes.len(), 2);
704        assert_eq!(
705            out.nodes[0].properties["name"],
706            vec![SchemaValue::Text("Widget A".into())]
707        );
708        assert_eq!(
709            out.nodes[1].properties["name"],
710            vec![SchemaValue::Text("Widget B".into())]
711        );
712    }
713
714    #[test]
715    fn no_microdata() {
716        let html = "<html><body><p>No microdata here</p></body></html>";
717        let out = MicrodataExtractor.extract(html).expect("extraction failed");
718        assert!(out.nodes.is_empty());
719        assert!(out.warnings.is_empty());
720    }
721
722    #[test]
723    fn itemscope_without_itemtype_warns() {
724        let html = r#"<html><body>
725<div itemscope>
726  <span itemprop="name">Something</span>
727</div>
728</body></html>"#;
729
730        let out = MicrodataExtractor.extract(html).expect("extraction failed");
731        assert_eq!(out.nodes.len(), 1);
732        assert!(out.nodes[0].types.is_empty());
733        assert!(out
734            .warnings
735            .iter()
736            .any(|w| w.code == WarningCode::EmptyType));
737    }
738
739    #[test]
740    fn itemprop_in_wrapper_div() {
741        // itemprop elements inside non-itemscope wrapper divs
742        let html = r#"<html><body>
743<div itemscope itemtype="https://schema.org/Product">
744  <div class="wrapper">
745    <div class="inner">
746      <span itemprop="name">Widget</span>
747    </div>
748  </div>
749</div>
750</body></html>"#;
751
752        let out = MicrodataExtractor.extract(html).expect("extraction failed");
753        assert_eq!(out.nodes.len(), 1);
754        assert_eq!(
755            out.nodes[0].properties["name"],
756            vec![SchemaValue::Text("Widget".into())]
757        );
758    }
759
760    #[test]
761    fn time_without_datetime_uses_text() {
762        let html = r#"<html><body>
763<div itemscope itemtype="https://schema.org/Event">
764  <time itemprop="startDate">June 15, 2024</time>
765</div>
766</body></html>"#;
767
768        let out = MicrodataExtractor.extract(html).expect("extraction failed");
769        assert_eq!(
770            out.nodes[0].properties["startDate"],
771            vec![SchemaValue::Text("June 15, 2024".into())]
772        );
773    }
774
775    #[test]
776    fn link_without_href_uses_text() {
777        let html = r#"<html><body>
778<div itemscope itemtype="https://schema.org/Product">
779  <a itemprop="url">Click here</a>
780</div>
781</body></html>"#;
782
783        let out = MicrodataExtractor.extract(html).expect("extraction failed");
784        assert_eq!(
785            out.nodes[0].properties["url"],
786            vec![SchemaValue::Text("Click here".into())]
787        );
788    }
789
    #[test]
    fn circular_itemref_does_not_loop() {
        // Item "a" pulls in the properties of block "b" through `itemref`.
        // NOTE(review): "b" carries no `itemref` pointing back at "a", so this
        // fixture is a plain one-way reference, not a true cycle - consider
        // adding a back-reference if cycle handling is what should be covered.
        let html = r#"<html><body>
<div id="a" itemscope itemtype="https://schema.org/Product" itemref="b">
  <span itemprop="name">Product A</span>
</div>
<div id="b">
  <span itemprop="description">Desc from B</span>
</div>
</body></html>"#;

        let out = MicrodataExtractor.extract(html).expect("must not hang");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(
            out.nodes[0].properties["description"],
            vec![SchemaValue::Text("Desc from B".into())]
        );
    }
808
    #[test]
    fn self_referencing_itemref() {
        // An item references its own id
        let html = r#"<html><body>
<div id="self" itemscope itemtype="https://schema.org/Product" itemref="self">
  <span itemprop="name">Widget</span>
</div>
</body></html>"#;

        // Extraction must terminate and yield exactly one node. Only the
        // node count is asserted here; whether re-visiting the element
        // duplicates its property values is not checked by this test.
        let out = MicrodataExtractor.extract(html).expect("must not hang");
        assert_eq!(out.nodes.len(), 1);
    }
823
824    #[test]
825    fn itemref_multiple_ids() {
826        let html = r#"<html><body>
827<div itemscope itemtype="https://schema.org/Product" itemref="desc-block price-block">
828  <span itemprop="name">Widget</span>
829</div>
830<div id="desc-block">
831  <span itemprop="description">A fine widget</span>
832</div>
833<div id="price-block">
834  <meta itemprop="price" content="29.99">
835</div>
836</body></html>"#;
837
838        let out = MicrodataExtractor.extract(html).expect("extraction failed");
839        assert_eq!(out.nodes.len(), 1);
840        assert_eq!(
841            out.nodes[0].properties["description"],
842            vec![SchemaValue::Text("A fine widget".into())]
843        );
844        assert_eq!(
845            out.nodes[0].properties["price"],
846            vec![SchemaValue::Text("29.99".into())]
847        );
848    }
849
850    #[test]
851    fn empty_itemprop_attribute_skipped() {
852        let html = r#"<html><body>
853<div itemscope itemtype="https://schema.org/Product">
854  <span itemprop="">should be skipped</span>
855  <span itemprop="name">Widget</span>
856</div>
857</body></html>"#;
858
859        let out = MicrodataExtractor.extract(html).expect("extraction failed");
860        assert_eq!(out.nodes.len(), 1);
861        // Empty itemprop should not create a property with empty-string key
862        assert!(!out.nodes[0].properties.contains_key(""));
863        assert_eq!(
864            out.nodes[0].properties["name"],
865            vec![SchemaValue::Text("Widget".into())]
866        );
867    }
868
869    #[test]
870    fn object_element_data_attribute() {
871        let html = r#"<html><body>
872<div itemscope itemtype="https://schema.org/Product">
873  <span itemprop="name">Widget</span>
874  <object itemprop="image" data="https://example.com/widget.swf">fallback</object>
875</div>
876</body></html>"#;
877
878        let out = MicrodataExtractor.extract(html).expect("extraction failed");
879        assert_eq!(
880            out.nodes[0].properties["image"],
881            vec![SchemaValue::Url("https://example.com/widget.swf".into())]
882        );
883    }
884
885    #[test]
886    fn embed_element_src_attribute() {
887        let html = r#"<html><body>
888<div itemscope itemtype="https://schema.org/Product">
889  <span itemprop="name">Widget</span>
890  <embed itemprop="video" src="https://example.com/demo.mp4">
891</div>
892</body></html>"#;
893
894        let out = MicrodataExtractor.extract(html).expect("extraction failed");
895        assert_eq!(
896            out.nodes[0].properties["video"],
897            vec![SchemaValue::Url("https://example.com/demo.mp4".into())]
898        );
899    }
900
901    #[test]
902    fn source_element_src_attribute() {
903        let html = r#"<html><body>
904<div itemscope itemtype="https://schema.org/Product">
905  <span itemprop="name">Widget</span>
906  <source itemprop="audio" src="https://example.com/sound.mp3">
907</div>
908</body></html>"#;
909
910        let out = MicrodataExtractor.extract(html).expect("extraction failed");
911        assert_eq!(
912            out.nodes[0].properties["audio"],
913            vec![SchemaValue::Url("https://example.com/sound.mp3".into())]
914        );
915    }
916
917    #[test]
918    fn depth_exceeding_max_warns() {
919        // Build HTML with MAX_DEPTH + 2 nested itemscopes
920        let mut html = String::from("<html><body>");
921        let target = MAX_DEPTH + 2;
922        for i in 0..target {
923            html.push_str(&format!(
924                r#"<div itemprop="child" itemscope "#,
925            ));
926            html.push_str(&format!(
927                r#"itemtype="https://schema.org/Thing">"#,
928            ));
929            html.push_str(&format!(
930                r#"<span itemprop="name">L{i}</span>"#,
931            ));
932        }
933        for _ in 0..target {
934            html.push_str("</div>");
935        }
936        html.push_str("</body></html>");
937
938        // Remove itemprop from the outermost to make it a top-level item
939        let html = html.replacen(r#"itemprop="child" "#, "", 1);
940
941        let out = MicrodataExtractor
942            .extract(&html)
943            .expect("extraction failed");
944        assert!(
945            out.warnings
946                .iter()
947                .any(|w| w.message.contains("depth") || w.message.contains("Microdata")),
948            "should warn when exceeding MAX_DEPTH"
949        );
950    }
951
952    #[test]
953    fn empty_itemtype_attribute() {
954        let html = r#"<html><body>
955<div itemscope itemtype="">
956  <span itemprop="name">Something</span>
957</div>
958</body></html>"#;
959
960        let out = MicrodataExtractor.extract(html).expect("extraction failed");
961        assert_eq!(out.nodes.len(), 1);
962        assert!(out.nodes[0].types.is_empty());
963        assert!(out
964            .warnings
965            .iter()
966            .any(|w| w.code == WarningCode::EmptyType));
967    }
968
969    #[test]
970    fn meter_non_numeric_value_fallback() {
971        let html = r#"<html><body>
972<div itemscope itemtype="https://schema.org/Product">
973  <span itemprop="name">Widget</span>
974  <meter itemprop="score" value="not-a-number">High</meter>
975</div>
976</body></html>"#;
977
978        let out = MicrodataExtractor.extract(html).expect("extraction failed");
979        assert_eq!(
980            out.nodes[0].properties["score"],
981            vec![SchemaValue::Text("not-a-number".into())]
982        );
983    }
984
985    #[test]
986    fn img_empty_src_gives_empty_text() {
987        let html = r#"<html><body>
988<div itemscope itemtype="https://schema.org/Product">
989  <span itemprop="name">Widget</span>
990  <img itemprop="image" src="">
991</div>
992</body></html>"#;
993
994        let out = MicrodataExtractor.extract(html).expect("extraction failed");
995        assert_eq!(
996            out.nodes[0].properties["image"],
997            vec![SchemaValue::Text(String::new())]
998        );
999    }
1000
1001    #[test]
1002    fn itemref_to_element_with_itemprop() {
1003        // The referenced element itself has itemprop, so it should be
1004        // extracted as a property directly
1005        let html = r#"<html><body>
1006<div itemscope itemtype="https://schema.org/Product" itemref="ext-name">
1007  <span itemprop="description">A fine widget</span>
1008</div>
1009<span id="ext-name" itemprop="name">Widget</span>
1010</body></html>"#;
1011
1012        let out = MicrodataExtractor.extract(html).expect("extraction failed");
1013        assert_eq!(out.nodes.len(), 1);
1014        assert_eq!(
1015            out.nodes[0].properties["name"],
1016            vec![SchemaValue::Text("Widget".into())]
1017        );
1018    }
1019
1020    #[test]
1021    fn unicode_preserved_in_values() {
1022        let html = r#"<html><body>
1023<div itemscope itemtype="https://schema.org/Product">
1024  <span itemprop="name">Gerät für Ökologie</span>
1025</div>
1026</body></html>"#;
1027
1028        let out = MicrodataExtractor.extract(html).expect("extraction failed");
1029        assert_eq!(out.nodes.len(), 1);
1030        assert_eq!(
1031            out.nodes[0].properties["name"],
1032            vec![SchemaValue::Text("Gerät für Ökologie".into())]
1033        );
1034    }
1035
1036    #[test]
1037    fn object_empty_data_gives_empty_text() {
1038        let html = r#"<html><body>
1039<div itemscope itemtype="https://schema.org/Product">
1040  <span itemprop="name">Widget</span>
1041  <object itemprop="image" data="">fallback</object>
1042</div>
1043</body></html>"#;
1044
1045        let out = MicrodataExtractor.extract(html).expect("extraction failed");
1046        assert_eq!(
1047            out.nodes[0].properties["image"],
1048            vec![SchemaValue::Text(String::new())]
1049        );
1050    }
1051}