1use std::sync::OnceLock;
22
23use ego_tree::NodeRef;
24use indexmap::IndexMap;
25use scraper::node::Node;
26use scraper::{Html, Selector};
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Upper bound on `itemscope` nesting. `extract_item` refuses any item whose
/// `depth` argument is greater than this value and records a
/// `MalformedMicrodata` warning instead of descending further.
const MAX_DEPTH: usize = 20;
35
/// Stateless extractor for HTML Microdata (`itemscope` / `itemprop` markup).
///
/// The type carries no data; it exists so the extraction entry points can be
/// reached through the [`Extractor`] trait and through
/// [`MicrodataExtractor::extract_from_document`].
pub struct MicrodataExtractor;
53
54impl Extractor for MicrodataExtractor {
55 fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
56 let document = Html::parse_document(html);
57 self.extract_from_document(&document)
58 }
59}
60
61impl MicrodataExtractor {
62 pub fn extract_from_document(
74 &self,
75 document: &Html,
76 ) -> Result<ExtractionOutput, ExtractionError> {
77 static SELECTOR: OnceLock<Selector> = OnceLock::new();
80 let selector = SELECTOR.get_or_init(|| {
81 Selector::parse("[itemscope]").expect("static selector '[itemscope]' must parse")
82 });
83
84 let mut warnings = Vec::new();
85 let mut nodes = Vec::new();
86
87 for element in document.select(selector) {
88 if element.value().attr("itemprop").is_some() {
90 continue;
91 }
92
93 match extract_item(&element, document, &mut warnings, 0) {
94 Some(node) => nodes.push(node),
95 None => {
96 warnings.push(ExtractionWarning {
97 message: "failed to extract Microdata item".into(),
98 source_location: None,
99 code: WarningCode::MalformedMicrodata,
100 });
101 }
102 }
103 }
104
105 Ok(ExtractionOutput { nodes, warnings })
106 }
107}
108
109fn extract_item(
111 element: &scraper::ElementRef<'_>,
112 document: &Html,
113 warnings: &mut Vec<ExtractionWarning>,
114 depth: usize,
115) -> Option<SchemaNode> {
116 if depth > MAX_DEPTH {
117 warnings.push(ExtractionWarning {
118 message: format!("Microdata nesting depth exceeds {MAX_DEPTH}, skipping"),
119 source_location: None,
120 code: WarningCode::MalformedMicrodata,
121 });
122 return None;
123 }
124
125 let el = element.value();
126
127 let types = extract_itemtypes(el);
129
130 let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
132
133 if let Some(item_id) = el.attr("itemid") {
135 properties
136 .entry("@id".into())
137 .or_default()
138 .push(classify_text_value(item_id));
139 }
140
141 collect_properties(element, document, warnings, &mut properties, depth);
143
144 if let Some(refs) = el.attr("itemref") {
146 for ref_id in refs.split_whitespace() {
147 match find_element_by_id(document, ref_id) {
150 Some(ref_element) => {
151 if ref_element.value().attr("itemprop").is_some() {
152 extract_prop_value(
153 &ref_element,
154 document,
155 warnings,
156 &mut properties,
157 depth,
158 );
159 } else {
160 collect_properties(
161 &ref_element,
162 document,
163 warnings,
164 &mut properties,
165 depth,
166 );
167 }
168 }
169 None => {
170 warnings.push(ExtractionWarning {
171 message: format!("itemref target not found: #{ref_id}"),
172 source_location: None,
173 code: WarningCode::UnresolvableReference,
174 });
175 }
176 }
177 }
178 }
179
180 if types.is_empty() && properties.is_empty() {
181 return None;
182 }
183
184 if types.is_empty() {
185 warnings.push(ExtractionWarning {
186 message: "Microdata item has itemscope but no itemtype".into(),
187 source_location: None,
188 code: WarningCode::EmptyType,
189 });
190 }
191
192 Some(SchemaNode {
193 types,
194 properties,
195 source_format: SourceFormat::Microdata,
196 source_location: None,
197 })
198}
199
200fn collect_properties(
206 element: &scraper::ElementRef<'_>,
207 document: &Html,
208 warnings: &mut Vec<ExtractionWarning>,
209 properties: &mut IndexMap<String, Vec<SchemaValue>>,
210 depth: usize,
211) {
212 for child in element.children() {
213 visit_for_properties(child, document, warnings, properties, depth);
214 }
215}
216
217fn visit_for_properties(
219 node: NodeRef<'_, Node>,
220 document: &Html,
221 warnings: &mut Vec<ExtractionWarning>,
222 properties: &mut IndexMap<String, Vec<SchemaValue>>,
223 depth: usize,
224) {
225 if let Some(el) = node.value().as_element() {
226 let Some(elem_ref) = scraper::ElementRef::wrap(node) else {
227 return;
228 };
229
230 if el.attr("itemprop").is_some() {
231 extract_prop_value(&elem_ref, document, warnings, properties, depth);
233 return; }
235
236 if el.attr("itemscope").is_some() {
239 return;
240 }
241 }
242
243 for child in node.children() {
245 visit_for_properties(child, document, warnings, properties, depth);
246 }
247}
248
249fn extract_prop_value(
253 element: &scraper::ElementRef<'_>,
254 document: &Html,
255 warnings: &mut Vec<ExtractionWarning>,
256 properties: &mut IndexMap<String, Vec<SchemaValue>>,
257 depth: usize,
258) {
259 let el = element.value();
260 let prop_names: Vec<&str> = el
261 .attr("itemprop")
262 .unwrap_or("")
263 .split_whitespace()
264 .collect();
265
266 if prop_names.is_empty() {
267 return;
268 }
269
270 let value = extract_element_value(element, document, warnings, depth);
271
272 for name in prop_names {
273 properties
274 .entry(name.to_string())
275 .or_default()
276 .push(value.clone());
277 }
278}
279
280fn extract_element_value(
292 element: &scraper::ElementRef<'_>,
293 document: &Html,
294 warnings: &mut Vec<ExtractionWarning>,
295 depth: usize,
296) -> SchemaValue {
297 let el = element.value();
298 let tag = el.name();
299
300 if el.attr("itemscope").is_some() {
302 return match extract_item(element, document, warnings, depth + 1) {
303 Some(node) => SchemaValue::Node(Box::new(node)),
304 None => SchemaValue::Text(String::new()),
305 };
306 }
307
308 match tag {
309 "meta" => {
310 let content = el.attr("content").unwrap_or("");
311 classify_text_value(content)
312 }
313 "a" | "link" | "area" => {
314 let href = el.attr("href").unwrap_or("");
315 if href.is_empty() {
316 SchemaValue::Text(element.text().collect::<String>().trim().to_string())
317 } else {
318 SchemaValue::Url(href.to_string())
319 }
320 }
321 "img" | "audio" | "video" | "source" | "embed" => {
322 let src = el.attr("src").unwrap_or("");
323 if src.is_empty() {
324 SchemaValue::Text(String::new())
325 } else {
326 SchemaValue::Url(src.to_string())
327 }
328 }
329 "object" => {
330 let data = el.attr("data").unwrap_or("");
331 if data.is_empty() {
332 SchemaValue::Text(String::new())
333 } else {
334 SchemaValue::Url(data.to_string())
335 }
336 }
337 "time" => {
338 let datetime = el.attr("datetime").unwrap_or("");
339 if datetime.is_empty() {
340 SchemaValue::Text(element.text().collect::<String>().trim().to_string())
341 } else {
342 SchemaValue::DateTime(datetime.to_string())
343 }
344 }
345 "data" => {
346 let val = el.attr("value").unwrap_or("");
347 if val.is_empty() {
348 SchemaValue::Text(element.text().collect::<String>().trim().to_string())
349 } else {
350 classify_text_value(val)
351 }
352 }
353 "meter" => {
354 let val = el.attr("value").unwrap_or("");
355 match val.parse::<f64>() {
356 Ok(n) => SchemaValue::Number(n),
357 Err(_) => SchemaValue::Text(val.to_string()),
358 }
359 }
360 _ => {
361 let text = element.text().collect::<String>();
362 let trimmed = text.trim().to_string();
363 classify_text_value(&trimmed)
364 }
365 }
366}
367
368fn extract_itemtypes(el: &scraper::node::Element) -> Vec<String> {
370 el.attr("itemtype")
371 .map(|types| {
372 types
373 .split_whitespace()
374 .map(|s| strip_schema_prefix(s).into_owned())
375 .collect()
376 })
377 .unwrap_or_default()
378}
379
380fn find_element_by_id<'a>(document: &'a Html, id: &str) -> Option<scraper::ElementRef<'a>> {
385 document
386 .tree
387 .root()
388 .descendants()
389 .filter_map(scraper::ElementRef::wrap)
390 .find(|el| el.value().id() == Some(id))
391}
392
393#[cfg(test)]
394mod tests {
395 use pretty_assertions::assert_eq;
396
397 use super::*;
398
399 #[test]
400 fn basic_product() {
401 let html = r#"<html><body>
402<div itemscope itemtype="https://schema.org/Product">
403 <span itemprop="name">Widget</span>
404 <span itemprop="description">A great widget</span>
405</div>
406</body></html>"#;
407
408 let out = MicrodataExtractor.extract(html).expect("extraction failed");
409 assert_eq!(out.nodes.len(), 1);
410 assert_eq!(out.nodes[0].types, vec!["Product"]);
411 assert_eq!(out.nodes[0].source_format, SourceFormat::Microdata);
412 assert_eq!(
413 out.nodes[0].properties["name"],
414 vec![SchemaValue::Text("Widget".into())]
415 );
416 assert_eq!(
417 out.nodes[0].properties["description"],
418 vec![SchemaValue::Text("A great widget".into())]
419 );
420 }
421
422 #[test]
423 fn nested_offer() {
424 let html = r#"<html><body>
425<div itemscope itemtype="https://schema.org/Product">
426 <span itemprop="name">Widget</span>
427 <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
428 <span itemprop="priceCurrency">USD</span>
429 <meta itemprop="price" content="29.99">
430 </div>
431</div>
432</body></html>"#;
433
434 let out = MicrodataExtractor.extract(html).expect("extraction failed");
435 assert_eq!(out.nodes.len(), 1);
436 let offers = &out.nodes[0].properties["offers"];
437 assert_eq!(offers.len(), 1);
438 if let SchemaValue::Node(offer) = &offers[0] {
439 assert_eq!(offer.types, vec!["Offer"]);
440 assert_eq!(
441 offer.properties["priceCurrency"],
442 vec![SchemaValue::Text("USD".into())]
443 );
444 assert_eq!(
445 offer.properties["price"],
446 vec![SchemaValue::Text("29.99".into())]
447 );
448 } else {
449 panic!("Expected nested Node for offers");
450 }
451 }
452
453 #[test]
454 fn meta_content() {
455 let html = r#"<html><body>
456<div itemscope itemtype="https://schema.org/Product">
457 <meta itemprop="name" content="Invisible Widget">
458</div>
459</body></html>"#;
460
461 let out = MicrodataExtractor.extract(html).expect("extraction failed");
462 assert_eq!(
463 out.nodes[0].properties["name"],
464 vec![SchemaValue::Text("Invisible Widget".into())]
465 );
466 }
467
468 #[test]
469 fn link_href_as_url() {
470 let html = r#"<html><body>
471<div itemscope itemtype="https://schema.org/Product">
472 <span itemprop="name">Widget</span>
473 <a itemprop="url" href="https://example.com/widget">Link</a>
474</div>
475</body></html>"#;
476
477 let out = MicrodataExtractor.extract(html).expect("extraction failed");
478 assert_eq!(
479 out.nodes[0].properties["url"],
480 vec![SchemaValue::Url("https://example.com/widget".into())]
481 );
482 }
483
484 #[test]
485 fn img_src_as_url() {
486 let html = r#"<html><body>
487<div itemscope itemtype="https://schema.org/Product">
488 <span itemprop="name">Widget</span>
489 <img itemprop="image" src="https://example.com/img.jpg">
490</div>
491</body></html>"#;
492
493 let out = MicrodataExtractor.extract(html).expect("extraction failed");
494 assert_eq!(
495 out.nodes[0].properties["image"],
496 vec![SchemaValue::Url("https://example.com/img.jpg".into())]
497 );
498 }
499
500 #[test]
501 fn time_datetime() {
502 let html = r#"<html><body>
503<div itemscope itemtype="https://schema.org/Event">
504 <span itemprop="name">Concert</span>
505 <time itemprop="startDate" datetime="2024-06-15T19:00:00">June 15</time>
506</div>
507</body></html>"#;
508
509 let out = MicrodataExtractor.extract(html).expect("extraction failed");
510 assert_eq!(
511 out.nodes[0].properties["startDate"],
512 vec![SchemaValue::DateTime("2024-06-15T19:00:00".into())]
513 );
514 }
515
516 #[test]
517 fn meter_value_as_number() {
518 let html = r#"<html><body>
519<div itemscope itemtype="https://schema.org/Product">
520 <span itemprop="name">Widget</span>
521 <meter itemprop="ratingValue" value="4.5" min="0" max="5">4.5 stars</meter>
522</div>
523</body></html>"#;
524
525 let out = MicrodataExtractor.extract(html).expect("extraction failed");
526 assert_eq!(
527 out.nodes[0].properties["ratingValue"],
528 vec![SchemaValue::Number(4.5)]
529 );
530 }
531
532 #[test]
533 fn data_value_attribute() {
534 let html = r#"<html><body>
535<div itemscope itemtype="https://schema.org/Product">
536 <data itemprop="sku" value="12345">Product SKU</data>
537</div>
538</body></html>"#;
539
540 let out = MicrodataExtractor.extract(html).expect("extraction failed");
541 assert_eq!(
542 out.nodes[0].properties["sku"],
543 vec![SchemaValue::Text("12345".into())]
544 );
545 }
546
547 #[test]
548 fn space_separated_itemprop() {
549 let html = r#"<html><body>
550<div itemscope itemtype="https://schema.org/Product">
551 <span itemprop="name alternateName">Widget</span>
552</div>
553</body></html>"#;
554
555 let out = MicrodataExtractor.extract(html).expect("extraction failed");
556 assert_eq!(
557 out.nodes[0].properties["name"],
558 vec![SchemaValue::Text("Widget".into())]
559 );
560 assert_eq!(
561 out.nodes[0].properties["alternateName"],
562 vec![SchemaValue::Text("Widget".into())]
563 );
564 }
565
566 #[test]
567 fn multiple_values_same_property() {
568 let html = r#"<html><body>
569<div itemscope itemtype="https://schema.org/Product">
570 <span itemprop="name">Widget</span>
571 <img itemprop="image" src="https://example.com/img1.jpg">
572 <img itemprop="image" src="https://example.com/img2.jpg">
573</div>
574</body></html>"#;
575
576 let out = MicrodataExtractor.extract(html).expect("extraction failed");
577 assert_eq!(out.nodes[0].properties["image"].len(), 2);
578 }
579
580 #[test]
581 fn itemid_becomes_at_id() {
582 let html = r#"<html><body>
583<div itemscope itemtype="https://schema.org/Product" itemid="https://example.com/product/123">
584 <span itemprop="name">Widget</span>
585</div>
586</body></html>"#;
587
588 let out = MicrodataExtractor.extract(html).expect("extraction failed");
589 assert_eq!(
590 out.nodes[0].properties["@id"],
591 vec![SchemaValue::Url("https://example.com/product/123".into())]
592 );
593 }
594
595 #[test]
596 fn itemref_collects_external_properties() {
597 let html = r#"<html><body>
598<div itemscope itemtype="https://schema.org/Product" itemref="desc-block">
599 <span itemprop="name">Widget</span>
600</div>
601<div id="desc-block">
602 <span itemprop="description">A fine product</span>
603</div>
604</body></html>"#;
605
606 let out = MicrodataExtractor.extract(html).expect("extraction failed");
607 assert_eq!(out.nodes.len(), 1);
608 assert_eq!(
609 out.nodes[0].properties["description"],
610 vec![SchemaValue::Text("A fine product".into())]
611 );
612 }
613
614 #[test]
615 fn itemref_missing_target_warns() {
616 let html = r#"<html><body>
617<div itemscope itemtype="https://schema.org/Product" itemref="nonexistent">
618 <span itemprop="name">Widget</span>
619</div>
620</body></html>"#;
621
622 let out = MicrodataExtractor.extract(html).expect("extraction failed");
623 assert!(out
624 .warnings
625 .iter()
626 .any(|w| w.code == WarningCode::UnresolvableReference));
627 }
628
629 #[test]
630 fn multiple_itemtypes() {
631 let html = r#"<html><body>
632<div itemscope itemtype="https://schema.org/Product https://schema.org/IndividualProduct">
633 <span itemprop="name">Widget</span>
634</div>
635</body></html>"#;
636
637 let out = MicrodataExtractor.extract(html).expect("extraction failed");
638 assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
639 }
640
641 #[test]
642 fn http_prefix_stripped() {
643 let html = r#"<html><body>
644<div itemscope itemtype="http://schema.org/Product">
645 <span itemprop="name">Widget</span>
646</div>
647</body></html>"#;
648
649 let out = MicrodataExtractor.extract(html).expect("extraction failed");
650 assert_eq!(out.nodes[0].types, vec!["Product"]);
651 }
652
653 #[test]
654 fn deeply_nested_scopes() {
655 let html = r#"<html><body>
656<div itemscope itemtype="https://schema.org/Product">
657 <span itemprop="name">Widget</span>
658 <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
659 <meta itemprop="price" content="29.99">
660 <div itemprop="seller" itemscope itemtype="https://schema.org/Organization">
661 <span itemprop="name">Acme</span>
662 <div itemprop="address" itemscope itemtype="https://schema.org/PostalAddress">
663 <span itemprop="addressCountry">US</span>
664 </div>
665 </div>
666 </div>
667</div>
668</body></html>"#;
669
670 let out = MicrodataExtractor.extract(html).expect("extraction failed");
671 assert_eq!(out.nodes.len(), 1);
672 if let SchemaValue::Node(offer) = &out.nodes[0].properties["offers"][0] {
673 if let SchemaValue::Node(seller) = &offer.properties["seller"][0] {
674 if let SchemaValue::Node(addr) = &seller.properties["address"][0] {
675 assert_eq!(addr.types, vec!["PostalAddress"]);
676 assert_eq!(
677 addr.properties["addressCountry"],
678 vec![SchemaValue::Text("US".into())]
679 );
680 } else {
681 panic!("Expected PostalAddress node");
682 }
683 } else {
684 panic!("Expected Organization node");
685 }
686 } else {
687 panic!("Expected Offer node");
688 }
689 }
690
691 #[test]
692 fn multiple_top_level_items() {
693 let html = r#"<html><body>
694<div itemscope itemtype="https://schema.org/Product">
695 <span itemprop="name">Widget A</span>
696</div>
697<div itemscope itemtype="https://schema.org/Product">
698 <span itemprop="name">Widget B</span>
699</div>
700</body></html>"#;
701
702 let out = MicrodataExtractor.extract(html).expect("extraction failed");
703 assert_eq!(out.nodes.len(), 2);
704 assert_eq!(
705 out.nodes[0].properties["name"],
706 vec![SchemaValue::Text("Widget A".into())]
707 );
708 assert_eq!(
709 out.nodes[1].properties["name"],
710 vec![SchemaValue::Text("Widget B".into())]
711 );
712 }
713
714 #[test]
715 fn no_microdata() {
716 let html = "<html><body><p>No microdata here</p></body></html>";
717 let out = MicrodataExtractor.extract(html).expect("extraction failed");
718 assert!(out.nodes.is_empty());
719 assert!(out.warnings.is_empty());
720 }
721
722 #[test]
723 fn itemscope_without_itemtype_warns() {
724 let html = r#"<html><body>
725<div itemscope>
726 <span itemprop="name">Something</span>
727</div>
728</body></html>"#;
729
730 let out = MicrodataExtractor.extract(html).expect("extraction failed");
731 assert_eq!(out.nodes.len(), 1);
732 assert!(out.nodes[0].types.is_empty());
733 assert!(out
734 .warnings
735 .iter()
736 .any(|w| w.code == WarningCode::EmptyType));
737 }
738
739 #[test]
740 fn itemprop_in_wrapper_div() {
741 let html = r#"<html><body>
743<div itemscope itemtype="https://schema.org/Product">
744 <div class="wrapper">
745 <div class="inner">
746 <span itemprop="name">Widget</span>
747 </div>
748 </div>
749</div>
750</body></html>"#;
751
752 let out = MicrodataExtractor.extract(html).expect("extraction failed");
753 assert_eq!(out.nodes.len(), 1);
754 assert_eq!(
755 out.nodes[0].properties["name"],
756 vec![SchemaValue::Text("Widget".into())]
757 );
758 }
759
760 #[test]
761 fn time_without_datetime_uses_text() {
762 let html = r#"<html><body>
763<div itemscope itemtype="https://schema.org/Event">
764 <time itemprop="startDate">June 15, 2024</time>
765</div>
766</body></html>"#;
767
768 let out = MicrodataExtractor.extract(html).expect("extraction failed");
769 assert_eq!(
770 out.nodes[0].properties["startDate"],
771 vec![SchemaValue::Text("June 15, 2024".into())]
772 );
773 }
774
775 #[test]
776 fn link_without_href_uses_text() {
777 let html = r#"<html><body>
778<div itemscope itemtype="https://schema.org/Product">
779 <a itemprop="url">Click here</a>
780</div>
781</body></html>"#;
782
783 let out = MicrodataExtractor.extract(html).expect("extraction failed");
784 assert_eq!(
785 out.nodes[0].properties["url"],
786 vec![SchemaValue::Text("Click here".into())]
787 );
788 }
789
790 #[test]
791 fn circular_itemref_does_not_loop() {
792 let html = r#"<html><body>
793<div id="a" itemscope itemtype="https://schema.org/Product" itemref="b">
794 <span itemprop="name">Product A</span>
795</div>
796<div id="b">
797 <span itemprop="description">Desc from B</span>
798</div>
799</body></html>"#;
800
801 let out = MicrodataExtractor.extract(html).expect("must not hang");
802 assert_eq!(out.nodes.len(), 1);
803 assert_eq!(
804 out.nodes[0].properties["description"],
805 vec![SchemaValue::Text("Desc from B".into())]
806 );
807 }
808
809 #[test]
810 fn self_referencing_itemref() {
811 let html = r#"<html><body>
813<div id="self" itemscope itemtype="https://schema.org/Product" itemref="self">
814 <span itemprop="name">Widget</span>
815</div>
816</body></html>"#;
817
818 let out = MicrodataExtractor.extract(html).expect("must not hang");
821 assert_eq!(out.nodes.len(), 1);
822 }
823
824 #[test]
825 fn itemref_multiple_ids() {
826 let html = r#"<html><body>
827<div itemscope itemtype="https://schema.org/Product" itemref="desc-block price-block">
828 <span itemprop="name">Widget</span>
829</div>
830<div id="desc-block">
831 <span itemprop="description">A fine widget</span>
832</div>
833<div id="price-block">
834 <meta itemprop="price" content="29.99">
835</div>
836</body></html>"#;
837
838 let out = MicrodataExtractor.extract(html).expect("extraction failed");
839 assert_eq!(out.nodes.len(), 1);
840 assert_eq!(
841 out.nodes[0].properties["description"],
842 vec![SchemaValue::Text("A fine widget".into())]
843 );
844 assert_eq!(
845 out.nodes[0].properties["price"],
846 vec![SchemaValue::Text("29.99".into())]
847 );
848 }
849
850 #[test]
851 fn empty_itemprop_attribute_skipped() {
852 let html = r#"<html><body>
853<div itemscope itemtype="https://schema.org/Product">
854 <span itemprop="">should be skipped</span>
855 <span itemprop="name">Widget</span>
856</div>
857</body></html>"#;
858
859 let out = MicrodataExtractor.extract(html).expect("extraction failed");
860 assert_eq!(out.nodes.len(), 1);
861 assert!(!out.nodes[0].properties.contains_key(""));
863 assert_eq!(
864 out.nodes[0].properties["name"],
865 vec![SchemaValue::Text("Widget".into())]
866 );
867 }
868
869 #[test]
870 fn object_element_data_attribute() {
871 let html = r#"<html><body>
872<div itemscope itemtype="https://schema.org/Product">
873 <span itemprop="name">Widget</span>
874 <object itemprop="image" data="https://example.com/widget.swf">fallback</object>
875</div>
876</body></html>"#;
877
878 let out = MicrodataExtractor.extract(html).expect("extraction failed");
879 assert_eq!(
880 out.nodes[0].properties["image"],
881 vec![SchemaValue::Url("https://example.com/widget.swf".into())]
882 );
883 }
884
885 #[test]
886 fn embed_element_src_attribute() {
887 let html = r#"<html><body>
888<div itemscope itemtype="https://schema.org/Product">
889 <span itemprop="name">Widget</span>
890 <embed itemprop="video" src="https://example.com/demo.mp4">
891</div>
892</body></html>"#;
893
894 let out = MicrodataExtractor.extract(html).expect("extraction failed");
895 assert_eq!(
896 out.nodes[0].properties["video"],
897 vec![SchemaValue::Url("https://example.com/demo.mp4".into())]
898 );
899 }
900
901 #[test]
902 fn source_element_src_attribute() {
903 let html = r#"<html><body>
904<div itemscope itemtype="https://schema.org/Product">
905 <span itemprop="name">Widget</span>
906 <source itemprop="audio" src="https://example.com/sound.mp3">
907</div>
908</body></html>"#;
909
910 let out = MicrodataExtractor.extract(html).expect("extraction failed");
911 assert_eq!(
912 out.nodes[0].properties["audio"],
913 vec![SchemaValue::Url("https://example.com/sound.mp3".into())]
914 );
915 }
916
917 #[test]
918 fn depth_exceeding_max_warns() {
919 let mut html = String::from("<html><body>");
921 let target = MAX_DEPTH + 2;
922 for i in 0..target {
923 html.push_str(&format!(
924 r#"<div itemprop="child" itemscope "#,
925 ));
926 html.push_str(&format!(
927 r#"itemtype="https://schema.org/Thing">"#,
928 ));
929 html.push_str(&format!(
930 r#"<span itemprop="name">L{i}</span>"#,
931 ));
932 }
933 for _ in 0..target {
934 html.push_str("</div>");
935 }
936 html.push_str("</body></html>");
937
938 let html = html.replacen(r#"itemprop="child" "#, "", 1);
940
941 let out = MicrodataExtractor
942 .extract(&html)
943 .expect("extraction failed");
944 assert!(
945 out.warnings
946 .iter()
947 .any(|w| w.message.contains("depth") || w.message.contains("Microdata")),
948 "should warn when exceeding MAX_DEPTH"
949 );
950 }
951
952 #[test]
953 fn empty_itemtype_attribute() {
954 let html = r#"<html><body>
955<div itemscope itemtype="">
956 <span itemprop="name">Something</span>
957</div>
958</body></html>"#;
959
960 let out = MicrodataExtractor.extract(html).expect("extraction failed");
961 assert_eq!(out.nodes.len(), 1);
962 assert!(out.nodes[0].types.is_empty());
963 assert!(out
964 .warnings
965 .iter()
966 .any(|w| w.code == WarningCode::EmptyType));
967 }
968
969 #[test]
970 fn meter_non_numeric_value_fallback() {
971 let html = r#"<html><body>
972<div itemscope itemtype="https://schema.org/Product">
973 <span itemprop="name">Widget</span>
974 <meter itemprop="score" value="not-a-number">High</meter>
975</div>
976</body></html>"#;
977
978 let out = MicrodataExtractor.extract(html).expect("extraction failed");
979 assert_eq!(
980 out.nodes[0].properties["score"],
981 vec![SchemaValue::Text("not-a-number".into())]
982 );
983 }
984
985 #[test]
986 fn img_empty_src_gives_empty_text() {
987 let html = r#"<html><body>
988<div itemscope itemtype="https://schema.org/Product">
989 <span itemprop="name">Widget</span>
990 <img itemprop="image" src="">
991</div>
992</body></html>"#;
993
994 let out = MicrodataExtractor.extract(html).expect("extraction failed");
995 assert_eq!(
996 out.nodes[0].properties["image"],
997 vec![SchemaValue::Text(String::new())]
998 );
999 }
1000
1001 #[test]
1002 fn itemref_to_element_with_itemprop() {
1003 let html = r#"<html><body>
1006<div itemscope itemtype="https://schema.org/Product" itemref="ext-name">
1007 <span itemprop="description">A fine widget</span>
1008</div>
1009<span id="ext-name" itemprop="name">Widget</span>
1010</body></html>"#;
1011
1012 let out = MicrodataExtractor.extract(html).expect("extraction failed");
1013 assert_eq!(out.nodes.len(), 1);
1014 assert_eq!(
1015 out.nodes[0].properties["name"],
1016 vec![SchemaValue::Text("Widget".into())]
1017 );
1018 }
1019
1020 #[test]
1021 fn unicode_preserved_in_values() {
1022 let html = r#"<html><body>
1023<div itemscope itemtype="https://schema.org/Product">
1024 <span itemprop="name">Gerät für Ökologie</span>
1025</div>
1026</body></html>"#;
1027
1028 let out = MicrodataExtractor.extract(html).expect("extraction failed");
1029 assert_eq!(out.nodes.len(), 1);
1030 assert_eq!(
1031 out.nodes[0].properties["name"],
1032 vec![SchemaValue::Text("Gerät für Ökologie".into())]
1033 );
1034 }
1035
1036 #[test]
1037 fn object_empty_data_gives_empty_text() {
1038 let html = r#"<html><body>
1039<div itemscope itemtype="https://schema.org/Product">
1040 <span itemprop="name">Widget</span>
1041 <object itemprop="image" data="">fallback</object>
1042</div>
1043</body></html>"#;
1044
1045 let out = MicrodataExtractor.extract(html).expect("extraction failed");
1046 assert_eq!(
1047 out.nodes[0].properties["image"],
1048 vec![SchemaValue::Text(String::new())]
1049 );
1050 }
1051}