use std::borrow::Cow;
use ego_tree::NodeRef;
use indexmap::IndexMap;
use scraper::node::Node;
use scraper::Html;
use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
use crate::types::{SchemaNode, SchemaValue, SourceFormat};
use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
/// Upper bound on the `depth` argument threaded through the recursive DOM
/// walkers in this file; each of them returns early, without recording a
/// warning, once `depth` exceeds this value.
const MAX_DEPTH: usize = 20;
/// Stateless extractor that turns `typeof` / `property` markup found in HTML
/// into [`SchemaNode`]s tagged with [`SourceFormat::RdfaLite`].
pub struct RdfaLiteExtractor;

impl Extractor for RdfaLiteExtractor {
    /// Parses `html` as a complete document and hands the parsed tree to
    /// [`RdfaLiteExtractor::extract_from_document`].
    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
        let parsed = Html::parse_document(html);
        let outcome = self.extract_from_document(&parsed);
        outcome
    }
}
impl RdfaLiteExtractor {
    /// Extracts RDFa Lite items from an already parsed document.
    ///
    /// Traversal starts at the tree root with an empty context (no vocabulary,
    /// no prefix mappings) and depth `0`. Every node and warning gathered by
    /// `walk_dom` is returned in the resulting [`ExtractionOutput`].
    ///
    /// # Errors
    ///
    /// The body shown here always returns `Ok`; the `Result` is part of the
    /// signature only.
    pub fn extract_from_document(
        &self,
        document: &Html,
    ) -> Result<ExtractionOutput, ExtractionError> {
        let root_ctx = RdfaContext {
            prefixes: IndexMap::new(),
            vocab: None,
        };
        let (mut nodes, mut warnings) = (Vec::new(), Vec::new());

        document
            .tree
            .root()
            .children()
            .for_each(|top| walk_dom(top, &root_ctx, &mut nodes, &mut warnings, 0));

        Ok(ExtractionOutput { nodes, warnings })
    }
}
/// Vocabulary and prefix declarations in effect at a given point of the DOM.
#[derive(Debug, Clone)]
struct RdfaContext {
    /// Value of the closest enclosing non-empty `vocab` attribute, normalised
    /// by `ensure_trailing_slash`; `None` when no vocabulary is active or it
    /// was reset with `vocab=""`. Nothing inside this block reads the field.
    vocab: Option<String>,
    /// Prefix name (without the trailing `:`) mapped to its namespace URI.
    prefixes: IndexMap<String, String>,
}

impl RdfaContext {
    /// Derives the context that applies inside `el`.
    ///
    /// Returns `None` when `el` carries neither `vocab` nor `prefix`, so the
    /// caller can keep borrowing the parent context instead of cloning it.
    fn updated(&self, el: &scraper::node::Element) -> Option<Self> {
        let vocab_attr = el.attr("vocab");
        let prefix_attr = el.attr("prefix");
        if vocab_attr.is_none() && prefix_attr.is_none() {
            return None;
        }

        let mut derived = self.clone();
        if let Some(vocab) = vocab_attr {
            // An empty `vocab` clears the inherited vocabulary.
            derived.vocab = (!vocab.is_empty()).then(|| ensure_trailing_slash(vocab));
        }
        if let Some(declarations) = prefix_attr {
            parse_prefix_attr(declarations, &mut derived.prefixes);
        }
        Some(derived)
    }

    /// Resolves a `typeof` / `property` token to the name stored in the output.
    ///
    /// Order of attempts:
    /// 1. If `strip_schema_prefix` hands back an owned string, that string is
    ///    the result (presumably this means a schema.org prefix was removed —
    ///    confirm against the helper's definition).
    /// 2. Otherwise, if the token contains `:` and the part before the first
    ///    `:` is a declared prefix, the namespace URI and the remainder are
    ///    concatenated and passed through `strip_schema_prefix`.
    /// 3. Otherwise the token is returned unchanged.
    fn resolve_term(&self, term: &str) -> String {
        if let Cow::Owned(direct) = strip_schema_prefix(term) {
            return direct;
        }

        if let Some((prefix, local)) = term.split_once(':') {
            if let Some(ns_uri) = self.prefixes.get(prefix) {
                let expanded = format!("{ns_uri}{local}");
                return strip_schema_prefix(&expanded).into_owned();
            }
        }

        term.to_string()
    }
}
/// Recursively searches the DOM for top-level typed elements (`typeof`).
///
/// * Non-element nodes are transparent: their children are visited with the
///   same context and the same `depth`.
/// * An element without `typeof` is descended into at `depth + 1`, using the
///   context updated by its own `vocab` / `prefix` attributes.
/// * An element with `typeof` becomes one [`SchemaNode`] pushed onto `nodes`.
///   Its `resource` attribute, if present, is stored under `"@id"`, and its
///   properties are gathered by `collect_rdfa_properties`. This function does
///   not descend any further below such an element.
///
/// A `typeof` attribute that yields no tokens adds an `EmptyType` warning; the
/// node is still emitted with an empty type list. Recursion stops without a
/// warning once `depth` exceeds `MAX_DEPTH`.
fn walk_dom(
    node: NodeRef<'_, Node>,
    parent_ctx: &RdfaContext,
    nodes: &mut Vec<SchemaNode>,
    warnings: &mut Vec<ExtractionWarning>,
    depth: usize,
) {
    if depth > MAX_DEPTH {
        return;
    }

    let element = match node.value().as_element() {
        Some(element) => element,
        None => {
            // Document / text / comment nodes do not count towards the depth.
            node.children()
                .for_each(|child| walk_dom(child, parent_ctx, nodes, warnings, depth));
            return;
        }
    };

    let local_ctx = parent_ctx.updated(element);
    let ctx = local_ctx.as_ref().unwrap_or(parent_ctx);

    let Some(typeof_attr) = element.attr("typeof") else {
        for child in node.children() {
            walk_dom(child, ctx, nodes, warnings, depth + 1);
        }
        return;
    };

    let mut types: Vec<String> = Vec::new();
    for token in typeof_attr.split_whitespace() {
        types.push(ctx.resolve_term(token));
    }
    if types.is_empty() {
        warnings.push(ExtractionWarning {
            message: "RDFa typeof attribute is empty".into(),
            source_location: None,
            code: WarningCode::EmptyType,
        });
    }

    let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
    if let Some(resource) = element.attr("resource") {
        // The map is still empty here, so this is the first and only "@id" entry.
        properties.insert("@id".into(), vec![classify_text_value(resource)]);
    }
    collect_rdfa_properties(node, ctx, &mut properties, warnings, depth + 1);

    nodes.push(SchemaNode {
        types,
        properties,
        source_format: SourceFormat::RdfaLite,
        source_location: None,
    });
}
/// Gathers the properties declared beneath `node` into `properties`.
///
/// Each direct child is passed to `visit_for_rdfa_props` with the same
/// `depth`; nothing happens when `depth` already exceeds `MAX_DEPTH`.
fn collect_rdfa_properties(
    node: NodeRef<'_, Node>,
    ctx: &RdfaContext,
    properties: &mut IndexMap<String, Vec<SchemaValue>>,
    warnings: &mut Vec<ExtractionWarning>,
    depth: usize,
) {
    if depth <= MAX_DEPTH {
        node.children().for_each(|child| {
            visit_for_rdfa_props(child, ctx, properties, warnings, depth);
        });
    }
}
/// Visits one node beneath a typed element and records the RDFa properties it
/// declares into `properties`.
///
/// * Non-element nodes are ignored.
/// * An element whose `property` attribute names at least one term adds one
///   value to every listed name. With `typeof` the value is a nested
///   [`SchemaNode`] whose own properties come from `collect_rdfa_properties`;
///   without it the value comes from `extract_rdfa_value`. The element's
///   subtree is not searched any further by this call.
/// * An element whose `property` attribute is missing, empty or
///   whitespace-only is treated as a plain wrapper and its children are
///   visited at `depth + 1`. Previously an empty `property=""` discarded
///   every property declared by the element's descendants.
/// * An element with `typeof` but no usable `property` is skipped together
///   with its subtree.
///
/// Recursion stops without a warning once `depth` exceeds `MAX_DEPTH`.
fn visit_for_rdfa_props(
    node: NodeRef<'_, Node>,
    parent_ctx: &RdfaContext,
    properties: &mut IndexMap<String, Vec<SchemaValue>>,
    warnings: &mut Vec<ExtractionWarning>,
    depth: usize,
) {
    if depth > MAX_DEPTH {
        return;
    }
    let Some(el) = node.value().as_element() else {
        return;
    };

    // `vocab` / `prefix` on this element affect how its own terms resolve.
    let updated_ctx = parent_ctx.updated(el);
    let ctx = updated_ctx.as_ref().unwrap_or(parent_ctx);

    // A `property` attribute without any token yields an empty list, which is
    // handled exactly like a missing attribute below.
    let prop_names: Vec<String> = el
        .attr("property")
        .map(|attr| {
            attr.split_whitespace()
                .map(|p| ctx.resolve_term(p))
                .collect::<Vec<String>>()
        })
        .unwrap_or_default();

    if !prop_names.is_empty() {
        let value = match el.attr("typeof") {
            Some(typeof_attr) => {
                let types: Vec<String> = typeof_attr
                    .split_whitespace()
                    .map(|t| ctx.resolve_term(t))
                    .collect();
                let mut nested_props: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
                if let Some(resource) = el.attr("resource") {
                    nested_props
                        .entry("@id".into())
                        .or_default()
                        .push(classify_text_value(resource));
                }
                collect_rdfa_properties(node, ctx, &mut nested_props, warnings, depth + 1);
                SchemaValue::Node(Box::new(SchemaNode {
                    types,
                    properties: nested_props,
                    source_format: SourceFormat::RdfaLite,
                    source_location: None,
                }))
            }
            None => extract_rdfa_value(node, el),
        };
        // The same value is attached to every name listed in the attribute.
        for name in prop_names {
            properties.entry(name).or_default().push(value.clone());
        }
        return;
    }

    // A typed element that is not linked through `property` does not
    // contribute to the enclosing node; its subtree is left alone.
    if el.attr("typeof").is_some() {
        return;
    }

    for child in node.children() {
        visit_for_rdfa_props(child, ctx, properties, warnings, depth + 1);
    }
}
/// Picks the value of a property-bearing element that has no `typeof`.
///
/// The first matching source wins:
/// 1. `content`, then `resource` — classified by `classify_text_value`.
/// 2. `href` — always a `Url` on `a` / `link` / `area`, classified otherwise.
/// 3. `src` — always a `Url` on `img` / `audio` / `video` / `source` /
///    `embed`, classified otherwise.
/// 4. `datetime` on a `time` element — stored as `DateTime` verbatim.
/// 5. `value` on a `data` element — classified.
/// 6. The element's concatenated text content, trimmed and classified.
fn extract_rdfa_value(node: NodeRef<'_, Node>, el: &scraper::node::Element) -> SchemaValue {
    if let Some(explicit) = el.attr("content").or_else(|| el.attr("resource")) {
        return classify_text_value(explicit);
    }

    let tag = el.name();

    if let Some(href) = el.attr("href") {
        return if matches!(tag, "a" | "link" | "area") {
            SchemaValue::Url(href.to_string())
        } else {
            classify_text_value(href)
        };
    }

    if let Some(src) = el.attr("src") {
        return if matches!(tag, "img" | "audio" | "video" | "source" | "embed") {
            SchemaValue::Url(src.to_string())
        } else {
            classify_text_value(src)
        };
    }

    match tag {
        "time" => {
            if let Some(datetime) = el.attr("datetime") {
                return SchemaValue::DateTime(datetime.to_string());
            }
        }
        "data" => {
            if let Some(val) = el.attr("value") {
                return classify_text_value(val);
            }
        }
        _ => {}
    }

    // Fall back to the visible text of the element.
    let raw_text = collect_text_content(node);
    classify_text_value(raw_text.trim())
}
/// Concatenates, in traversal order, every text node yielded by
/// `node.descendants()`. No separators are inserted and nothing is trimmed.
fn collect_text_content(node: NodeRef<'_, Node>) -> String {
    node.descendants()
        .filter_map(|descendant| descendant.value().as_text())
        .fold(String::new(), |mut buffer, chunk| {
            buffer.push_str(chunk);
            buffer
        })
}
/// Parses the value of a `prefix` attribute (`"name: uri name2: uri2 …"`) and
/// inserts each mapping into `prefixes`.
///
/// The attribute is split on whitespace. A token ending in `:` that is
/// followed by another token is stored (without the colon) with that next
/// token as its URI, and both tokens are consumed. A token not ending in `:`
/// is skipped on its own. A final token with nothing after it is ignored.
fn parse_prefix_attr(attr: &str, prefixes: &mut IndexMap<String, String>) {
    let mut tokens = attr.split_whitespace().peekable();
    while let Some(candidate) = tokens.next() {
        // A mapping needs a following token; without one we are done.
        let Some(&uri) = tokens.peek() else {
            break;
        };
        if let Some(name) = candidate.strip_suffix(':') {
            prefixes.insert(name.to_string(), uri.to_string());
            // The URI token belongs to this mapping; do not treat it as a prefix.
            tokens.next();
        }
    }
}
/// Returns `uri` unchanged when it already ends in `/` or `#`; otherwise
/// returns it with a single `/` appended.
fn ensure_trailing_slash(uri: &str) -> String {
    let mut normalised = String::with_capacity(uri.len() + 1);
    normalised.push_str(uri);
    // Both terminators are ASCII, so inspecting the last byte is sufficient.
    if !matches!(uri.as_bytes().last(), Some(b'/' | b'#')) {
        normalised.push('/');
    }
    normalised
}
// Unit tests for the RDFa Lite extractor. Each test feeds a small HTML snippet
// through `RdfaLiteExtractor::extract` (or calls a helper directly) and checks
// the resulting nodes, property values or warnings.
#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use super::*;
// A typed element with two text properties yields one node with both values.
#[test]
fn basic_product() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<span property="description">A great widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(out.nodes[0].types, vec!["Product"]);
assert_eq!(out.nodes[0].source_format, SourceFormat::RdfaLite);
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text("Widget".into())]
);
assert_eq!(
out.nodes[0].properties["description"],
vec![SchemaValue::Text("A great widget".into())]
);
}
// `property` + `typeof` on the same element produces a nested node value
// instead of a second top-level node.
#[test]
fn nested_typed_property() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<div property="offers" typeof="Offer">
<span property="priceCurrency">USD</span>
<meta property="price" content="29.99">
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
let offers = &out.nodes[0].properties["offers"];
assert_eq!(offers.len(), 1);
if let SchemaValue::Node(offer) = &offers[0] {
assert_eq!(offer.types, vec!["Offer"]);
assert_eq!(
offer.properties["priceCurrency"],
vec![SchemaValue::Text("USD".into())]
);
assert_eq!(
offer.properties["price"],
vec![SchemaValue::Text("29.99".into())]
);
} else {
panic!("Expected nested Node for offers");
}
}
// The `content` attribute supplies the property value.
#[test]
fn content_attribute() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<meta property="name" content="Widget">
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text("Widget".into())]
);
}
// `href` on an `a` element is reported as a `Url`, not the link text.
#[test]
fn href_as_url() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<a property="url" href="https://example.com/widget">Link</a>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["url"],
vec![SchemaValue::Url("https://example.com/widget".into())]
);
}
// `src` on an `img` element is reported as a `Url`.
#[test]
fn img_src_as_url() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<img property="image" src="https://example.com/img.jpg">
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["image"],
vec![SchemaValue::Url("https://example.com/img.jpg".into())]
);
}
// `datetime` on a `time` element wins over the element's text.
#[test]
fn time_datetime() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Event">
<span property="name">Concert</span>
<time property="startDate" datetime="2024-06-15T19:00:00">June 15</time>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["startDate"],
vec![SchemaValue::DateTime("2024-06-15T19:00:00".into())]
);
}
// `resource` on the typed element is stored under the "@id" key.
#[test]
fn resource_as_id() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product" resource="https://example.com/product/1">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["@id"],
vec![SchemaValue::Url("https://example.com/product/1".into())]
);
}
// A `vocab` declared on an ancestor does not prevent the typed element
// from being found and typed.
#[test]
fn vocab_inheritance() {
let html = r#"<html vocab="https://schema.org/"><body>
<div typeof="Product">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(out.nodes[0].types, vec!["Product"]);
}
// `schema:`-prefixed terms resolve to their bare local names.
#[test]
fn prefix_resolution() {
let html = r#"<html prefix="schema: https://schema.org/"><body>
<div vocab="https://schema.org/" typeof="schema:Product">
<span property="schema:name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(out.nodes[0].types, vec!["Product"]);
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text("Widget".into())]
);
}
// A whitespace-separated `typeof` yields several types, in source order.
#[test]
fn multiple_types() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product IndividualProduct">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
}
// Sibling typed elements produce separate nodes in document order.
#[test]
fn multiple_top_level_items() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget A</span>
</div>
<div vocab="https://schema.org/" typeof="Article">
<span property="name">Article B</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 2);
assert_eq!(out.nodes[0].types, vec!["Product"]);
assert_eq!(out.nodes[1].types, vec!["Article"]);
}
// Markup without RDFa attributes yields neither nodes nor warnings.
#[test]
fn no_rdfa() {
let html = "<html><body><p>No RDFa here</p></body></html>";
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert!(out.nodes.is_empty());
assert!(out.warnings.is_empty());
}
// Nested nodes can themselves contain nested nodes (Product -> Offer -> Organization).
#[test]
fn deep_nesting() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<div property="offers" typeof="Offer">
<meta property="price" content="29.99">
<div property="seller" typeof="Organization">
<span property="name">Acme</span>
</div>
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
if let SchemaValue::Node(offer) = &out.nodes[0].properties["offers"][0] {
assert_eq!(offer.types, vec!["Offer"]);
if let SchemaValue::Node(seller) = &offer.properties["seller"][0] {
assert_eq!(seller.types, vec!["Organization"]);
assert_eq!(
seller.properties["name"],
vec![SchemaValue::Text("Acme".into())]
);
} else {
panic!("Expected Organization node");
}
} else {
panic!("Expected Offer node");
}
}
// Properties inside a wrapper element without RDFa attributes still
// belong to the enclosing typed element.
#[test]
fn property_in_wrapper_div() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<div class="wrapper">
<span property="name">Widget</span>
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text("Widget".into())]
);
}
// An `http://` (non-TLS) schema.org vocab gives the same type name.
#[test]
fn http_vocab() {
let html = r#"<html><body>
<div vocab="http://schema.org/" typeof="Product">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes[0].types, vec!["Product"]);
}
// Direct test of the helper: two `name: uri` pairs in one attribute value.
#[test]
fn parse_prefix_attr_works() {
let mut prefixes = IndexMap::new();
parse_prefix_attr(
"schema: https://schema.org/ og: https://ogp.me/ns#",
&mut prefixes,
);
assert_eq!(prefixes["schema"], "https://schema.org/");
assert_eq!(prefixes["og"], "https://ogp.me/ns#");
}
// Only asserts that the outer Product node is still produced when a
// descendant resets the vocabulary with `vocab=""`; nothing is asserted
// about the inner `CustomThing` element.
#[test]
fn empty_vocab_resets_vocabulary() {
let html = r#"<html vocab="https://schema.org/"><body>
<div typeof="Product">
<span property="name">Outer</span>
<div vocab="">
<div typeof="CustomThing">
<span property="label">Inner</span>
</div>
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert!(out
.nodes
.iter()
.any(|n| n.types.contains(&"Product".to_string())));
}
// Builds MAX_DEPTH + 2 nested `property="child" typeof="Thing"` elements,
// then strips the first `property` so the outermost one is a top-level
// item. Only asserts that extraction succeeds and yields at least one node.
#[test]
fn depth_exceeding_max_truncates_silently() {
let mut html = String::from(r#"<html><body><div vocab="https://schema.org/">"#);
let target = MAX_DEPTH + 2;
for i in 0..target {
html.push_str(&format!(
r#"<div property="child" typeof="Thing"><span property="name">L{i}</span>"#
));
}
for _ in 0..target {
html.push_str("</div>");
}
html.push_str("</div></body></html>");
let html = html.replacen(r#"property="child" "#, "", 1);
let out = RdfaLiteExtractor.extract(&html).expect("extraction failed");
assert!(!out.nodes.is_empty());
}
// `typeof=""` on a top-level element records an `EmptyType` warning.
#[test]
fn empty_typeof_warns() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="">
<span property="name">Something</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert!(
out.warnings
.iter()
.any(|w| w.code == WarningCode::EmptyType),
"empty typeof should produce EmptyType warning"
);
}
// `value` on a `data` element wins over the element's text.
#[test]
fn data_element_with_value() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<data property="sku" value="12345">Product SKU</data>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["sku"],
vec![SchemaValue::Text("12345".into())]
);
}
// An element with no text content yields an empty `Text` value.
#[test]
fn property_with_empty_text() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name"></span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text(String::new())]
);
}
// `typeof` is honoured even when no `vocab` is declared anywhere.
#[test]
fn typeof_without_vocab() {
let html = r#"<html><body>
<div typeof="Product">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(out.nodes[0].types, vec!["Product"]);
}
// A URL inside `content` is classified as `Url` rather than `Text`.
#[test]
fn content_attribute_with_url_value() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<meta property="url" content="https://example.com/product">
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["url"],
vec![SchemaValue::Url("https://example.com/product".into())]
);
}
// `resource` on a nested typed property becomes the nested node's "@id".
#[test]
fn resource_on_nested_property() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
<div property="offers" typeof="Offer" resource="https://example.com/offer/1">
<span property="priceCurrency">USD</span>
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
let offers = &out.nodes[0].properties["offers"];
if let SchemaValue::Node(offer) = &offers[0] {
assert_eq!(
offer.properties["@id"],
vec![SchemaValue::Url("https://example.com/offer/1".into())]
);
} else {
panic!("Expected nested Offer node");
}
}
// `prefix` declarations on nested elements do not disturb extraction.
#[test]
fn nested_prefix_declarations() {
let html = r#"<html prefix="schema: https://schema.org/"><body>
<div prefix="og: https://ogp.me/ns#" vocab="https://schema.org/" typeof="Product">
<span property="name">Widget</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(out.nodes[0].types, vec!["Product"]);
}
// Only asserts that the outer WebPage node is produced; nothing is
// asserted about the inner `typeof="Organization"` element, which has no
// `property` attribute.
#[test]
fn independent_typeof_nested_in_typed_node() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="WebPage">
<span property="name">My Page</span>
<div typeof="Organization">
<span property="name">Acme Corp</span>
</div>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert!(out
.nodes
.iter()
.any(|n| n.types.contains(&"WebPage".to_string())));
}
// A `time` element without `datetime` falls back to its text content.
#[test]
fn time_element_without_datetime() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Event">
<span property="name">Concert</span>
<time property="startDate">June 15, 2024</time>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(
out.nodes[0].properties["startDate"],
vec![SchemaValue::Text("June 15, 2024".into())]
);
}
// Non-ASCII text is passed through unchanged.
#[test]
fn unicode_preserved_in_values() {
let html = r#"<html><body>
<div vocab="https://schema.org/" typeof="Product">
<span property="name">Gerät für Ökologie</span>
</div>
</body></html>"#;
let out = RdfaLiteExtractor.extract(html).expect("extraction failed");
assert_eq!(out.nodes.len(), 1);
assert_eq!(
out.nodes[0].properties["name"],
vec![SchemaValue::Text("Gerät für Ökologie".into())]
);
}
}