Skip to main content

schemaorg_rs/extraction/
rdfa.rs

1//! `RDFa` Lite 1.1 extractor: parses `vocab`/`typeof`/`property` attributes.
2//!
3//! Implements [RDFa Lite 1.1](https://www.w3.org/TR/rdfa-lite/) - the 5-attribute
4//! subset designed for Schema.org: `vocab`, `typeof`, `property`, `resource`, `prefix`.
5//!
6//! ## Supported features
7//!
8//! - `vocab` attribute for setting the default vocabulary (e.g. `https://schema.org/`)
9//! - `typeof` for defining types
10//! - `property` for defining properties
11//! - `resource` for overriding the subject URI
12//! - `prefix` for namespace prefix mappings
13//! - Nested typed nodes
//! - Value extraction from the `content`, `resource`, `href` and `src` attributes, plus `<time datetime>` and `<data value>`
15//!
16//! ## Not supported (full `RDFa` Core 1.1)
17//!
18//! - Complex CURIE expansion beyond simple `prefix:term`
19//! - `about`, `src`, `href` as subject identifiers
20//! - `rel` and `rev` properties
21//! - `@inlist` processing
22//! - Multiple interleaved vocabularies
23//! - XML `base` URI resolution
24//!
25//! This subset covers ~95% of real-world Schema.org `RDFa` usage.
26
27use std::borrow::Cow;
28
29use ego_tree::NodeRef;
30use indexmap::IndexMap;
31use scraper::node::Node;
32use scraper::Html;
33
34use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
35use crate::types::{SchemaNode, SchemaValue, SourceFormat};
36
37use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
38
/// Maximum nesting depth.
///
/// Traversal helpers in this module compare their `depth` argument against
/// this limit and return early once it is exceeded. The subtree below that
/// point is skipped and no warning is recorded for the truncation.
const MAX_DEPTH: usize = 20;
41
/// Extracts Schema.org structured data from `RDFa` Lite 1.1 attributes.
///
/// This is a field-less unit struct: it carries no configuration or state.
/// Use it through the [`Extractor`] trait (raw HTML string) or call
/// [`RdfaLiteExtractor::extract_from_document`] with an already-parsed
/// document.
///
/// # Examples
///
/// ```
/// use schemaorg_rs::extraction::{Extractor, RdfaLiteExtractor};
///
/// let html = r#"<html><body>
/// <div vocab="https://schema.org/" typeof="Product">
/// <span property="name">Widget</span>
/// </div>
/// </body></html>"#;
///
/// let output = RdfaLiteExtractor.extract(html).unwrap();
/// assert_eq!(output.nodes[0].types, vec!["Product"]);
/// ```
pub struct RdfaLiteExtractor;
59
60impl Extractor for RdfaLiteExtractor {
61    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
62        let document = Html::parse_document(html);
63        self.extract_from_document(&document)
64    }
65}
66
67impl RdfaLiteExtractor {
68    /// Extracts from an already-parsed document.
69    ///
70    /// # Errors
71    ///
72    /// Returns [`ExtractionError`] if a fatal error prevents extraction.
73    /// Most issues are captured as warnings in the returned output.
74    pub fn extract_from_document(
75        &self,
76        document: &Html,
77    ) -> Result<ExtractionOutput, ExtractionError> {
78        let mut warnings = Vec::new();
79        let mut nodes = Vec::new();
80
81        let context = RdfaContext {
82            vocab: None,
83            prefixes: IndexMap::new(),
84        };
85
86        // Walk the DOM tree starting from the root
87        for child in document.tree.root().children() {
88            walk_dom(child, &context, &mut nodes, &mut warnings, 0);
89        }
90
91        Ok(ExtractionOutput { nodes, warnings })
92    }
93}
94
/// Inherited `RDFa` evaluation context for one element.
///
/// A value of this type describes the vocabulary and prefix mappings in
/// effect at a given point of the DOM walk. Elements that declare `vocab` or
/// `prefix` get a modified clone (see `RdfaContext::updated`); all other
/// elements reuse their parent's context by reference.
#[derive(Debug, Clone)]
struct RdfaContext {
    /// Current default vocabulary URI (e.g. `https://schema.org/`).
    /// `None` when no vocabulary is in effect (never set, or reset by `vocab=""`).
    vocab: Option<String>,
    /// Registered namespace prefixes (e.g. `schema` -> `https://schema.org/`).
    /// Keys are stored without the trailing `:` used in the `prefix` attribute.
    prefixes: IndexMap<String, String>,
}
103
104impl RdfaContext {
105    /// Creates an updated context if this element changes `vocab` or `prefix`.
106    /// Returns `None` if the context is unchanged (avoids cloning).
107    fn updated(&self, el: &scraper::node::Element) -> Option<Self> {
108        let has_vocab = el.attr("vocab").is_some();
109        let has_prefix = el.attr("prefix").is_some();
110
111        if !has_vocab && !has_prefix {
112            return None;
113        }
114
115        let mut ctx = self.clone();
116
117        if let Some(vocab) = el.attr("vocab") {
118            ctx.vocab = if vocab.is_empty() {
119                None
120            } else {
121                Some(ensure_trailing_slash(vocab))
122            };
123        }
124
125        if let Some(prefix_attr) = el.attr("prefix") {
126            parse_prefix_attr(prefix_attr, &mut ctx.prefixes);
127        }
128
129        Some(ctx)
130    }
131
132    /// Resolves a potentially prefixed term to a full URI, then strips known vocabulary prefixes.
133    fn resolve_term(&self, term: &str) -> String {
134        // Full URI: strip the vocabulary prefix if present
135        let stripped = strip_schema_prefix(term);
136        if matches!(stripped, Cow::Owned(_)) {
137            return stripped.into_owned();
138        }
139
140        // Try prefix:term expansion (e.g. "schema:Product")
141        if let Some(colon_pos) = term.find(':') {
142            let prefix = &term[..colon_pos];
143            let local = &term[colon_pos + 1..];
144            if let Some(ns_uri) = self.prefixes.get(prefix) {
145                let full = format!("{ns_uri}{local}");
146                return strip_schema_prefix(&full).into_owned();
147            }
148        }
149
150        term.to_string()
151    }
152}
153
154/// Walks the DOM tree, collecting `RDFa` Lite structured data.
155fn walk_dom(
156    node: NodeRef<'_, Node>,
157    parent_ctx: &RdfaContext,
158    nodes: &mut Vec<SchemaNode>,
159    warnings: &mut Vec<ExtractionWarning>,
160    depth: usize,
161) {
162    if depth > MAX_DEPTH {
163        return;
164    }
165
166    let Some(el) = node.value().as_element() else {
167        // Not an element - recurse into children (e.g. template nodes)
168        for child in node.children() {
169            walk_dom(child, parent_ctx, nodes, warnings, depth);
170        }
171        return;
172    };
173
174    let updated_ctx = parent_ctx.updated(el);
175    let ctx = updated_ctx.as_ref().unwrap_or(parent_ctx);
176
177    // Does this element define a new typed node?
178    if let Some(typeof_attr) = el.attr("typeof") {
179        let types: Vec<String> = typeof_attr
180            .split_whitespace()
181            .map(|t| ctx.resolve_term(t))
182            .collect();
183
184        if types.is_empty() {
185            warnings.push(ExtractionWarning {
186                message: "RDFa typeof attribute is empty".into(),
187                source_location: None,
188                code: WarningCode::EmptyType,
189            });
190        }
191
192        let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
193
194        // Store resource as @id if present
195        if let Some(resource) = el.attr("resource") {
196            properties
197                .entry("@id".into())
198                .or_default()
199                .push(classify_text_value(resource));
200        }
201
202        // Collect properties from children
203        collect_rdfa_properties(node, ctx, &mut properties, warnings, depth + 1);
204
205        let schema_node = SchemaNode {
206            types,
207            properties,
208            source_format: SourceFormat::RdfaLite,
209            source_location: None,
210        };
211
212        nodes.push(schema_node);
213        return; // Children already processed by collect_rdfa_properties
214    }
215
216    // No typeof - continue walking children
217    for child in node.children() {
218        walk_dom(child, ctx, nodes, warnings, depth + 1);
219    }
220}
221
222/// Collects properties from children of a typed node.
223fn collect_rdfa_properties(
224    node: NodeRef<'_, Node>,
225    ctx: &RdfaContext,
226    properties: &mut IndexMap<String, Vec<SchemaValue>>,
227    warnings: &mut Vec<ExtractionWarning>,
228    depth: usize,
229) {
230    if depth > MAX_DEPTH {
231        return;
232    }
233
234    for child in node.children() {
235        visit_for_rdfa_props(child, ctx, properties, warnings, depth);
236    }
237}
238
239/// Visits a node looking for `RDFa` property attributes.
240fn visit_for_rdfa_props(
241    node: NodeRef<'_, Node>,
242    parent_ctx: &RdfaContext,
243    properties: &mut IndexMap<String, Vec<SchemaValue>>,
244    warnings: &mut Vec<ExtractionWarning>,
245    depth: usize,
246) {
247    if depth > MAX_DEPTH {
248        return;
249    }
250
251    let Some(el) = node.value().as_element() else {
252        return;
253    };
254
255    let updated_ctx = parent_ctx.updated(el);
256    let ctx = updated_ctx.as_ref().unwrap_or(parent_ctx);
257
258    // Does this element define a property?
259    if let Some(prop_attr) = el.attr("property") {
260        let prop_names: Vec<String> = prop_attr
261            .split_whitespace()
262            .map(|p| ctx.resolve_term(p))
263            .collect();
264
265        if prop_names.is_empty() {
266            return;
267        }
268
269        // Is this property also a new typed node?
270        if let Some(typeof_attr) = el.attr("typeof") {
271            let types: Vec<String> = typeof_attr
272                .split_whitespace()
273                .map(|t| ctx.resolve_term(t))
274                .collect();
275
276            let mut nested_props: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
277
278            if let Some(resource) = el.attr("resource") {
279                nested_props
280                    .entry("@id".into())
281                    .or_default()
282                    .push(classify_text_value(resource));
283            }
284
285            collect_rdfa_properties(node, ctx, &mut nested_props, warnings, depth + 1);
286
287            let nested_node = SchemaNode {
288                types,
289                properties: nested_props,
290                source_format: SourceFormat::RdfaLite,
291                source_location: None,
292            };
293
294            let value = SchemaValue::Node(Box::new(nested_node));
295            for name in &prop_names {
296                properties
297                    .entry(name.clone())
298                    .or_default()
299                    .push(value.clone());
300            }
301            return; // Children already consumed by collect_rdfa_properties
302        }
303
304        // Extract the value
305        let value = extract_rdfa_value(node, el);
306
307        for name in &prop_names {
308            properties
309                .entry(name.clone())
310                .or_default()
311                .push(value.clone());
312        }
313        return; // Property element owns its subtree
314    }
315
316    // Not a property - check for typeof (nested independent node)
317    if el.attr("typeof").is_some() {
318        // This is a new typed node that is NOT a property of the parent.
319        // We skip it here and let the top-level walk_dom handle it.
320        // But wait - it's nested inside an existing typed node. RDFa Lite doesn't
321        // have a clean way to express independent nested items like Microdata's
322        // itemprop-less itemscope. We skip to avoid double-counting.
323        return;
324    }
325
326    // No property, no typeof - recurse into children
327    for child in node.children() {
328        visit_for_rdfa_props(child, ctx, properties, warnings, depth + 1);
329    }
330}
331
332/// Extracts a value from an `RDFa` property element.
333fn extract_rdfa_value(node: NodeRef<'_, Node>, el: &scraper::node::Element) -> SchemaValue {
334    let tag = el.name();
335
336    // content attribute takes highest priority
337    if let Some(content) = el.attr("content") {
338        return classify_text_value(content);
339    }
340
341    // resource attribute -> URL/Text
342    if let Some(resource) = el.attr("resource") {
343        return classify_text_value(resource);
344    }
345
346    // href on links
347    if let Some(href) = el.attr("href") {
348        match tag {
349            "a" | "link" | "area" => return SchemaValue::Url(href.to_string()),
350            _ => return classify_text_value(href),
351        }
352    }
353
354    // src on media elements
355    if let Some(src) = el.attr("src") {
356        match tag {
357            "img" | "audio" | "video" | "source" | "embed" => {
358                return SchemaValue::Url(src.to_string())
359            }
360            _ => return classify_text_value(src),
361        }
362    }
363
364    // datetime on <time> elements
365    if tag == "time" {
366        if let Some(datetime) = el.attr("datetime") {
367            return SchemaValue::DateTime(datetime.to_string());
368        }
369    }
370
371    // data element value
372    if tag == "data" {
373        if let Some(val) = el.attr("value") {
374            return classify_text_value(val);
375        }
376    }
377
378    // Fall back to text content
379    let text = collect_text_content(node);
380    let trimmed = text.trim().to_string();
381    classify_text_value(&trimmed)
382}
383
384/// Collects text content from a node and all its descendants.
385fn collect_text_content(node: NodeRef<'_, Node>) -> String {
386    let mut text = String::new();
387    for descendant in node.descendants() {
388        if let Some(t) = descendant.value().as_text() {
389            text.push_str(t);
390        }
391    }
392    text
393}
394
395/// Parses the `prefix` attribute into prefix -> URI mappings.
396///
397/// Format: `prefix: URI prefix2: URI2` (space-separated pairs).
398fn parse_prefix_attr(attr: &str, prefixes: &mut IndexMap<String, String>) {
399    let tokens: Vec<&str> = attr.split_whitespace().collect();
400    let mut i = 0;
401    while i + 1 < tokens.len() {
402        let prefix = tokens[i];
403        let uri = tokens[i + 1];
404        if let Some(stripped) = prefix.strip_suffix(':') {
405            prefixes.insert(stripped.to_string(), uri.to_string());
406            i += 2;
407        } else {
408            i += 1;
409        }
410    }
411}
412
/// Normalises a vocabulary URI so that terms can be appended to it directly.
///
/// URIs already ending in `/` or `#` are returned unchanged; any other input
/// (including the empty string) gets a `/` appended.
fn ensure_trailing_slash(uri: &str) -> String {
    let mut normalized = String::with_capacity(uri.len() + 1);
    normalized.push_str(uri);

    let already_terminated = matches!(uri.chars().last(), Some('/' | '#'));
    if !already_terminated {
        normalized.push('/');
    }

    normalized
}
421
// Unit tests: each one feeds a small HTML snippet through `RdfaLiteExtractor`
// (or calls a private helper directly) and checks the extracted nodes,
// property values, or warnings.
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// A single typed element with two text properties yields one node.
    #[test]
    fn basic_product() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <span property="description">A great widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[0].source_format, SourceFormat::RdfaLite);
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Widget".into())]
        );
        assert_eq!(
            out.nodes[0].properties["description"],
            vec![SchemaValue::Text("A great widget".into())]
        );
    }

    /// `property` + `typeof` on the same element produces a nested node value.
    #[test]
    fn nested_typed_property() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <div property="offers" typeof="Offer">
    <span property="priceCurrency">USD</span>
    <meta property="price" content="29.99">
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        let offers = &out.nodes[0].properties["offers"];
        assert_eq!(offers.len(), 1);
        if let SchemaValue::Node(offer) = &offers[0] {
            assert_eq!(offer.types, vec!["Offer"]);
            assert_eq!(
                offer.properties["priceCurrency"],
                vec![SchemaValue::Text("USD".into())]
            );
            assert_eq!(
                offer.properties["price"],
                vec![SchemaValue::Text("29.99".into())]
            );
        } else {
            panic!("Expected nested Node for offers");
        }
    }

    /// The `content` attribute supplies the property value.
    #[test]
    fn content_attribute() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <meta property="name" content="Widget">
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Widget".into())]
        );
    }

    /// `href` on an `<a>` is taken as a URL value instead of the link text.
    #[test]
    fn href_as_url() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <a property="url" href="https://example.com/widget">Link</a>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["url"],
            vec![SchemaValue::Url("https://example.com/widget".into())]
        );
    }

    /// `src` on an `<img>` is taken as a URL value.
    #[test]
    fn img_src_as_url() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <img property="image" src="https://example.com/img.jpg">
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["image"],
            vec![SchemaValue::Url("https://example.com/img.jpg".into())]
        );
    }

    /// `datetime` on a `<time>` element wins over the human-readable text.
    #[test]
    fn time_datetime() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Event">
  <span property="name">Concert</span>
  <time property="startDate" datetime="2024-06-15T19:00:00">June 15</time>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["startDate"],
            vec![SchemaValue::DateTime("2024-06-15T19:00:00".into())]
        );
    }

    /// `resource` on a typed element is stored under the `@id` key.
    #[test]
    fn resource_as_id() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product" resource="https://example.com/product/1">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["@id"],
            vec![SchemaValue::Url("https://example.com/product/1".into())]
        );
    }

    /// A `vocab` declared on an ancestor (`<html>`) applies to descendants.
    #[test]
    fn vocab_inheritance() {
        let html = r#"<html vocab="https://schema.org/"><body>
<div typeof="Product">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    /// `prefix:term` names are expanded and reduced to the bare term.
    #[test]
    fn prefix_resolution() {
        let html = r#"<html prefix="schema: https://schema.org/"><body>
<div vocab="https://schema.org/" typeof="schema:Product">
  <span property="schema:name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Widget".into())]
        );
    }

    /// Whitespace-separated `typeof` tokens become multiple types, in order.
    #[test]
    fn multiple_types() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product IndividualProduct">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    /// Sibling typed elements are returned as separate nodes in document order.
    #[test]
    fn multiple_top_level_items() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget A</span>
</div>
<div vocab="https://schema.org/" typeof="Article">
  <span property="name">Article B</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    /// A document without RDFa attributes yields no nodes and no warnings.
    #[test]
    fn no_rdfa() {
        let html = "<html><body><p>No RDFa here</p></body></html>";
        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert!(out.warnings.is_empty());
    }

    /// Typed properties nest more than one level (Product -> Offer -> Organization).
    #[test]
    fn deep_nesting() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <div property="offers" typeof="Offer">
    <meta property="price" content="29.99">
    <div property="seller" typeof="Organization">
      <span property="name">Acme</span>
    </div>
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        if let SchemaValue::Node(offer) = &out.nodes[0].properties["offers"][0] {
            assert_eq!(offer.types, vec!["Offer"]);
            if let SchemaValue::Node(seller) = &offer.properties["seller"][0] {
                assert_eq!(seller.types, vec!["Organization"]);
                assert_eq!(
                    seller.properties["name"],
                    vec![SchemaValue::Text("Acme".into())]
                );
            } else {
                panic!("Expected Organization node");
            }
        } else {
            panic!("Expected Offer node");
        }
    }

    /// Properties inside attribute-less wrapper elements still belong to the
    /// enclosing typed node.
    #[test]
    fn property_in_wrapper_div() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <div class="wrapper">
    <span property="name">Widget</span>
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Widget".into())]
        );
    }

    /// The `http://` form of the Schema.org vocabulary is accepted as well.
    #[test]
    fn http_vocab() {
        let html = r#"<html><body>
<div vocab="http://schema.org/" typeof="Product">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    /// Direct check of the `prefix` attribute parser with two mappings.
    #[test]
    fn parse_prefix_attr_works() {
        let mut prefixes = IndexMap::new();
        parse_prefix_attr(
            "schema: https://schema.org/ og: https://ogp.me/ns#",
            &mut prefixes,
        );
        assert_eq!(prefixes["schema"], "https://schema.org/");
        assert_eq!(prefixes["og"], "https://ogp.me/ns#");
    }

    /// Smoke test for `vocab=""`; only asserts that the outer Product is
    /// still extracted.
    #[test]
    fn empty_vocab_resets_vocabulary() {
        // An empty vocab="" should reset the vocabulary to None
        let html = r#"<html vocab="https://schema.org/"><body>
<div typeof="Product">
  <span property="name">Outer</span>
  <div vocab="">
    <div typeof="CustomThing">
      <span property="label">Inner</span>
    </div>
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        // The outer Product should be extracted
        assert!(out
            .nodes
            .iter()
            .any(|n| n.types.contains(&"Product".to_string())));
    }

    /// Nesting deeper than `MAX_DEPTH` must not crash; the test only checks
    /// that some node is still returned.
    #[test]
    fn depth_exceeding_max_truncates_silently() {
        // Build HTML with MAX_DEPTH + 2 nested typeof elements
        let mut html = String::from(r#"<html><body><div vocab="https://schema.org/">"#);
        let target = MAX_DEPTH + 2;
        for i in 0..target {
            html.push_str(&format!(
                r#"<div property="child" typeof="Thing"><span property="name">L{i}</span>"#
            ));
        }
        for _ in 0..target {
            html.push_str("</div>");
        }
        html.push_str("</div></body></html>");

        // Remove the first property="child" to make the outermost a top-level item
        let html = html.replacen(r#"property="child" "#, "", 1);

        let out = RdfaLiteExtractor.extract(&html).expect("extraction failed");
        // Should extract without crashing even if deep nesting is truncated
        assert!(!out.nodes.is_empty());
    }

    /// `typeof=""` on a top-level element produces an `EmptyType` warning.
    #[test]
    fn empty_typeof_warns() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="">
  <span property="name">Something</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert!(
            out.warnings
                .iter()
                .any(|w| w.code == WarningCode::EmptyType),
            "empty typeof should produce EmptyType warning"
        );
    }

    /// `value` on a `<data>` element wins over its text content.
    #[test]
    fn data_element_with_value() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <data property="sku" value="12345">Product SKU</data>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["sku"],
            vec![SchemaValue::Text("12345".into())]
        );
    }

    /// An empty property element yields an empty text value, not a missing key.
    #[test]
    fn property_with_empty_text() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name"></span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text(String::new())]
        );
    }

    /// `typeof` is honoured even when no `vocab` is declared anywhere.
    #[test]
    fn typeof_without_vocab() {
        // typeof without vocab in ancestor chain -- types should be preserved as-is
        let html = r#"<html><body>
<div typeof="Product">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    /// A URL inside `content` is classified as a URL value.
    #[test]
    fn content_attribute_with_url_value() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <meta property="url" content="https://example.com/product">
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["url"],
            vec![SchemaValue::Url("https://example.com/product".into())]
        );
    }

    /// `resource` on a nested typed property becomes the nested node's `@id`.
    #[test]
    fn resource_on_nested_property() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
  <div property="offers" typeof="Offer" resource="https://example.com/offer/1">
    <span property="priceCurrency">USD</span>
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        let offers = &out.nodes[0].properties["offers"];
        if let SchemaValue::Node(offer) = &offers[0] {
            assert_eq!(
                offer.properties["@id"],
                vec![SchemaValue::Url("https://example.com/offer/1".into())]
            );
        } else {
            panic!("Expected nested Offer node");
        }
    }

    /// `prefix` declarations at different levels do not disturb extraction.
    #[test]
    fn nested_prefix_declarations() {
        let html = r#"<html prefix="schema: https://schema.org/"><body>
<div prefix="og: https://ogp.me/ns#" vocab="https://schema.org/" typeof="Product">
  <span property="name">Widget</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    /// Only asserts that the outer WebPage is extracted; the fate of the
    /// property-less inner Organization is not pinned by this test.
    #[test]
    fn independent_typeof_nested_in_typed_node() {
        // A typeof inside another typeof WITHOUT property attribute
        // should not be double-counted or added as a property of the parent.
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="WebPage">
  <span property="name">My Page</span>
  <div typeof="Organization">
    <span property="name">Acme Corp</span>
  </div>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        // The WebPage should be extracted as a top-level node.
        // The Organization (without property attribute) is skipped
        // by the current implementation to avoid double-counting.
        assert!(out
            .nodes
            .iter()
            .any(|n| n.types.contains(&"WebPage".to_string())));
    }

    /// `<time>` without `datetime` falls back to its text content.
    #[test]
    fn time_element_without_datetime() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Event">
  <span property="name">Concert</span>
  <time property="startDate">June 15, 2024</time>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        // Without datetime attribute, should fall back to text content
        assert_eq!(
            out.nodes[0].properties["startDate"],
            vec![SchemaValue::Text("June 15, 2024".into())]
        );
    }

    /// Non-ASCII text passes through unchanged.
    #[test]
    fn unicode_preserved_in_values() {
        let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
  <span property="name">Gerät für Ökologie</span>
</div>
</body></html>"#;

        let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Gerät für Ökologie".into())]
        );
    }
}