use crate::errors::Result;
use crate::extractors::common::{html_utils, url_utils};
use crate::types::microdata::MicrodataItem;
use scraper::{ElementRef, Selector};
#[cfg(test)]
mod tests;
/// Extracts every top-level microdata item found in `html`.
///
/// The document is parsed, all elements carrying an `itemscope` attribute are
/// visited in selection order, and those accepted by
/// `is_top_level_itemscope` are converted with `extract_item`. Items whose
/// extraction returns an error are skipped rather than aborting the whole
/// run. `base_url` is forwarded unchanged to the per-item extraction.
pub fn extract(html: &str, base_url: Option<&str>) -> Result<Vec<MicrodataItem>> {
    let document = html_utils::parse_html(html);
    // Constant selector string; parsing it is not input-dependent.
    let scopes = Selector::parse("[itemscope]").unwrap();

    let items = document
        .select(&scopes)
        .filter(|candidate| is_top_level_itemscope(candidate))
        .filter_map(|top_level| extract_item(&top_level, base_url).ok())
        .collect();

    Ok(items)
}
/// Decides whether an `itemscope` element should be reported as a top-level item.
///
/// An element without an `itemprop` attribute is always top-level. An element
/// with `itemprop` is top-level only when none of its ancestor elements has
/// an `itemscope` attribute (otherwise it is treated as a property of that
/// enclosing item).
fn is_top_level_itemscope(element: &ElementRef) -> bool {
    if element.value().attr("itemprop").is_none() {
        return true;
    }

    // `ancestors()` starts at the parent, so the element itself is not inspected.
    let has_enclosing_scope = element
        .ancestors()
        .filter_map(ElementRef::wrap)
        .any(|ancestor| ancestor.value().attr("itemscope").is_some());

    !has_enclosing_scope
}
/// Builds a `MicrodataItem` from a single `itemscope` element.
///
/// * `itemtype` is split on whitespace; the resulting list is stored only
///   when it contains at least one entry.
/// * `itemid` is copied verbatim when present.
/// * Properties are gathered by `extract_properties`, whose error (if any)
///   is propagated to the caller.
fn extract_item(element: &ElementRef, base_url: Option<&str>) -> Result<MicrodataItem> {
    let attributes = element.value();
    let mut item = MicrodataItem::new();

    let declared_types: Vec<String> = attributes
        .attr("itemtype")
        .into_iter()
        .flat_map(str::split_whitespace)
        .map(String::from)
        .collect();
    if !declared_types.is_empty() {
        item.item_type = Some(declared_types);
    }

    if let Some(identifier) = attributes.attr("itemid") {
        item.id = Some(String::from(identifier));
    }

    extract_properties(element, &mut item, base_url)?;
    Ok(item)
}
/// Collects the properties that belong directly to `scope` and stores them in `item`.
///
/// Every descendant element carrying an `itemprop` attribute is considered;
/// it is accepted only when `belongs_to_scope` confirms that `scope` is its
/// nearest enclosing `itemscope`. Accepted elements that are themselves an
/// `itemscope` are extracted recursively as nested items, all others are
/// reduced to a text value via `extract_property_value`.
///
/// The `itemprop` attribute is a set of space-separated property names, so
/// the value is registered once under each name. An attribute that contains
/// no names (empty or whitespace only) contributes nothing.
///
/// Nested items that fail to extract and elements without a usable value are
/// skipped. The function itself currently always returns `Ok(())`.
fn extract_properties(
    scope: &ElementRef,
    item: &mut MicrodataItem,
    base_url: Option<&str>,
) -> Result<()> {
    for element in scope.descendants().filter_map(ElementRef::wrap) {
        let raw_names = match element.value().attr("itemprop") {
            Some(names) => names,
            None => continue,
        };
        if !belongs_to_scope(scope, &element) {
            continue;
        }

        if element.value().attr("itemscope").is_some() {
            // The nested item is rebuilt for each name so that no `Clone`
            // implementation is required on `MicrodataItem`.
            for name in raw_names.split_whitespace() {
                if let Ok(nested_item) = extract_item(&element, base_url) {
                    item.add_item_property(name.to_string(), nested_item);
                }
            }
        } else if let Some(value) = extract_property_value(&element, base_url) {
            for name in raw_names.split_whitespace() {
                item.add_text_property(name.to_string(), value.clone());
            }
        }
    }
    Ok(())
}
/// Reports whether `scope` is the nearest enclosing `itemscope` of `prop_element`.
///
/// Walks the ancestor chain of `prop_element` (starting at its parent).
/// Reaching `scope` first yields `true`; reaching any other element with an
/// `itemscope` attribute first yields `false`, as does running out of
/// ancestors without meeting `scope`.
fn belongs_to_scope(scope: &ElementRef, prop_element: &ElementRef) -> bool {
    let target = scope.id();

    for ancestor in prop_element.ancestors().filter_map(ElementRef::wrap) {
        if ancestor.id() == target {
            return true;
        }
        // Any other itemscope between the property and `scope` owns the property.
        if ancestor.value().attr("itemscope").is_some() {
            return false;
        }
    }

    false
}
/// Derives the string value of a property element according to its tag name.
///
/// * `meta` uses `content`; `a`, `area`, `link` use `href`; media/embedding
///   tags use `src`; `object` uses `data`; `data` and `meter` use `value`.
///   A missing attribute yields `None`.
/// * `time` prefers `datetime` and otherwise falls back to its text content,
///   yielding `None` when that text is blank.
/// * Any other tag yields its concatenated text content (possibly empty).
///
/// When `is_url_property` accepts the element, the value is passed through
/// `url_utils::resolve_url` with `base_url`; if resolution fails the
/// unresolved value is returned instead.
fn extract_property_value(element: &ElementRef, base_url: Option<&str>) -> Option<String> {
    let node = element.value();
    let tag = node.name();
    let owned_attr = |name: &str| node.attr(name).map(str::to_owned);

    let raw = match tag {
        "meta" => owned_attr("content"),
        "a" | "area" | "link" => owned_attr("href"),
        "audio" | "embed" | "iframe" | "img" | "source" | "track" | "video" => owned_attr("src"),
        "object" => owned_attr("data"),
        "data" | "meter" => owned_attr("value"),
        "time" => owned_attr("datetime").or_else(|| {
            let text: String = element.text().collect();
            // Blank text is treated as "no value"; non-blank text is kept untrimmed.
            Some(text).filter(|candidate| !candidate.trim().is_empty())
        }),
        _ => Some(element.text().collect::<String>()),
    }?;

    let resolved = if is_url_property(tag, element) {
        url_utils::resolve_url(base_url, &raw).ok()
    } else {
        None
    };

    Some(resolved.unwrap_or(raw))
}
/// Tells whether the value taken from `element` under `tag_name` is a URL.
///
/// `a`, `area` and `link` always qualify. Media/embedding tags qualify when
/// the element has a `src` attribute, and `object` qualifies when it has a
/// `data` attribute. Every other tag name does not qualify.
fn is_url_property(tag_name: &str, element: &ElementRef) -> bool {
    let required_attr = match tag_name {
        "a" | "area" | "link" => return true,
        "audio" | "embed" | "iframe" | "img" | "source" | "track" | "video" => "src",
        "object" => "data",
        _ => return false,
    };

    element.value().attr(required_attr).is_some()
}
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Parses `markup`, selects the first element matching `css`, and hands
    /// it to `check`. Panics when the selector is invalid or nothing matches.
    fn with_first_match<T>(markup: &str, css: &str, check: impl FnOnce(ElementRef) -> T) -> T {
        let document = html_utils::parse_html(markup);
        let selector = Selector::parse(css).unwrap();
        let first = document.select(&selector).next().unwrap();
        check(first)
    }

    /// Anchors are URL properties; the same element is not when labelled as `span`.
    #[test]
    fn test_is_url_property() {
        with_first_match(r#"<a href="test">Link</a>"#, "a", |anchor| {
            assert!(is_url_property("a", &anchor));
            assert!(!is_url_property("span", &anchor));
        });
    }

    /// Generic elements yield their text content.
    #[test]
    fn test_extract_property_value_from_text() {
        let value = with_first_match(r#"<span>Test Value</span>"#, "span", |span| {
            extract_property_value(&span, None)
        });
        assert_eq!(value, Some("Test Value".to_string()));
    }

    /// `meta` elements yield their `content` attribute.
    #[test]
    fn test_extract_property_value_from_meta() {
        let value = with_first_match(r#"<meta content="test">"#, "meta", |meta| {
            extract_property_value(&meta, None)
        });
        assert_eq!(value, Some("test".to_string()));
    }

    /// `link` elements yield their `href` attribute.
    #[test]
    fn test_extract_property_value_from_link() {
        let value = with_first_match(r#"<link href="https://example.com">"#, "link", |link| {
            extract_property_value(&link, None)
        });
        assert_eq!(value, Some("https://example.com".to_string()));
    }
}