#[cfg(feature = "metaformats")]
use crate::types::{temporal, Class, Item, Properties, PropertyValue, TextValue, UrlValue};
#[cfg(feature = "metaformats")]
use std::str::FromStr;
#[cfg(feature = "metaformats")]
use url::Url;
#[cfg(feature = "metaformats")]
/// A `<meta>` element reduced to the three attributes this module reads.
///
/// Instances are produced by [`collect_meta_tags`], which only records
/// elements whose `content` attribute carries a value.
#[derive(Debug, Clone)]
pub struct MetaTag {
/// Value of the `name` attribute, if the element has one (e.g. `twitter:card`).
pub name: Option<String>,
/// Value of the `property` attribute, if the element has one (e.g. `og:title`).
pub property: Option<String>,
/// Value of the `content` attribute.
pub content: String,
}
#[cfg(feature = "metaformats")]
/// Gathers every `<meta>` element found at or below `head`.
///
/// The tree is walked depth-first in document order. A `<meta>` element is
/// recorded only when its `content` attribute has a value; `name` and
/// `property` are copied when present and left as `None` otherwise.
pub fn collect_meta_tags(head: &swc_html_ast::Element) -> Vec<MetaTag> {
    /// Returns the value of the first attribute called `wanted`, if it has one.
    fn attribute_text(element: &swc_html_ast::Element, wanted: &str) -> Option<String> {
        let attribute = element
            .attributes
            .iter()
            .find(|attr| attr.name == wanted)?;
        attribute.value.as_ref().map(|value| value.to_string())
    }

    /// Visits `element` first, then each of its element children in order.
    fn walk(element: &swc_html_ast::Element, found: &mut Vec<MetaTag>) {
        if element.tag_name == "meta" {
            if let Some(content) = attribute_text(element, "content") {
                found.push(MetaTag {
                    name: attribute_text(element, "name"),
                    property: attribute_text(element, "property"),
                    content,
                });
            }
        }

        for node in &element.children {
            if let swc_html_ast::Child::Element(nested) = node {
                walk(nested, found);
            }
        }
    }

    let mut collected = Vec::new();
    walk(head, &mut collected);
    collected
}
#[cfg(feature = "metaformats")]
/// Chooses the root class for the item synthesised from `meta_tags`.
///
/// Rules are tried in this order; the first one that applies wins:
///
/// 1. The first `og:type` property whose content is recognised:
///    `article` gives the page default, `profile` gives `Card`, and
///    `music` / `video` give `Cite`. Unrecognised `og:type` values are skipped.
/// 2. A `twitter:card` of `summary` or `summary_large_image` gives the page default.
/// 3. Any `og:`-prefixed property or `twitter:`-prefixed name gives the page default.
/// 4. Otherwise `Entry`.
///
/// The "page default" is `Card` when `is_home_page` is true and `Entry` otherwise.
/// The returned vector always holds exactly one class.
pub fn determine_item_type(meta_tags: &[MetaTag], is_home_page: bool) -> Vec<Class> {
    use microformats_types::KnownClass;

    // Class used whenever a rule says "depends on whether this is the home page".
    let page_default = || {
        if is_home_page {
            KnownClass::Card
        } else {
            KnownClass::Entry
        }
    };

    // Rule 1: the first og:type tag carrying a recognised value decides.
    let from_og_type = meta_tags
        .iter()
        .filter(|tag| tag.property.as_deref() == Some("og:type"))
        .find_map(|tag| match tag.content.as_str() {
            "article" => Some(page_default()),
            "profile" => Some(KnownClass::Card),
            "music" | "video" => Some(KnownClass::Cite),
            _ => None,
        });
    if let Some(known) = from_og_type {
        return vec![Class::Known(known)];
    }

    // Rule 2: a summary-style Twitter card.
    let has_summary_card = meta_tags.iter().any(|tag| {
        tag.name.as_deref() == Some("twitter:card")
            && matches!(tag.content.as_str(), "summary" | "summary_large_image")
    });

    // Rule 3: any Open Graph or Twitter metadata at all.
    let has_social_metadata = meta_tags.iter().any(|tag| {
        let og = tag
            .property
            .as_deref()
            .is_some_and(|property| property.starts_with("og:"));
        let twitter = tag
            .name
            .as_deref()
            .is_some_and(|name| name.starts_with("twitter:"));
        og || twitter
    });

    if has_summary_card || has_social_metadata {
        return vec![Class::Known(page_default())];
    }

    // Rule 4: nothing recognisable.
    vec![Class::Known(KnownClass::Entry)]
}
#[cfg(feature = "metaformats")]
/// Maps Open Graph and Twitter `<meta>` tags onto microformats properties.
///
/// Tags with empty `content` are ignored. Open Graph / `article:` properties
/// always overwrite whatever is stored under their key; Twitter names only
/// fill a key that is still absent when the tag is reached.
///
/// | meta tag                                | property    | value kind |
/// |-----------------------------------------|-------------|------------|
/// | `og:title` / `twitter:title`            | `name`      | plain text |
/// | `og:description` / `twitter:description`| `summary`   | plain text |
/// | `og:image` / `twitter:image`            | `photo`     | URL        |
/// | `og:video`                              | `video`     | URL        |
/// | `og:audio`                              | `audio`     | URL        |
/// | `article:published_time`                | `published` | temporal   |
/// | `article:modified_time`                 | `updated`   | temporal   |
/// | `article:author`                        | `author`    | plain text |
///
/// URL values are resolved against `base_url`; URLs that fail to resolve and
/// timestamps that fail to parse are silently dropped.
pub fn extract_properties(
    meta_tags: &[MetaTag],
    base_url: &Url,
    item_types: &[microformats_types::Class],
) -> Properties {
    /// Wraps `text` as a single plain-text property value.
    fn plain(text: &str) -> Vec<PropertyValue> {
        vec![PropertyValue::Plain(TextValue::new(text.to_string()))]
    }

    /// Resolves `reference` against `base`; `None` when joining fails.
    fn link(base: &Url, reference: &str) -> Option<Vec<PropertyValue>> {
        base.join(reference)
            .ok()
            .map(|resolved| vec![PropertyValue::Url(UrlValue::new(resolved))])
    }

    /// Parses `raw` as a temporal value; `None` when parsing fails.
    fn moment(raw: &str) -> Option<Vec<PropertyValue>> {
        temporal::Value::from_str(raw)
            .ok()
            .map(|parsed| vec![PropertyValue::Temporal(parsed)])
    }

    let mut properties = Properties::new();

    // Computed from `item_types` but not consulted by any mapping below.
    let _is_entry = item_types.iter().any(|class| {
        matches!(
            class,
            microformats_types::Class::Known(microformats_types::KnownClass::Entry)
        )
    });

    for tag in meta_tags.iter().filter(|tag| !tag.content.is_empty()) {
        let content = tag.content.as_str();

        // Open Graph / article properties: unconditional insert (overwrites).
        let og_entry = match tag.property.as_deref() {
            Some("og:title") => Some(("name", plain(content))),
            Some("og:description") => Some(("summary", plain(content))),
            Some("og:image") => link(base_url, content).map(|values| ("photo", values)),
            Some("og:video") => link(base_url, content).map(|values| ("video", values)),
            Some("og:audio") => link(base_url, content).map(|values| ("audio", values)),
            Some("article:published_time") => moment(content).map(|values| ("published", values)),
            Some("article:modified_time") => moment(content).map(|values| ("updated", values)),
            Some("article:author") => Some(("author", plain(content))),
            _ => None,
        };
        if let Some((key, values)) = og_entry {
            properties.insert(key.to_string(), values);
        }

        // Twitter names: only fill keys that are still missing.
        match tag.name.as_deref() {
            Some("twitter:title") if !properties.contains_key("name") => {
                properties.insert("name".to_string(), plain(content));
            }
            Some("twitter:description") if !properties.contains_key("summary") => {
                properties.insert("summary".to_string(), plain(content));
            }
            Some("twitter:image") if !properties.contains_key("photo") => {
                if let Some(values) = link(base_url, content) {
                    properties.insert("photo".to_string(), values);
                }
            }
            _ => {}
        }
    }

    properties
}
#[cfg(feature = "metaformats")]
/// Builds a single item from the metadata found under `head`.
///
/// Steps:
/// 1. Collect the `<meta>` tags and pick the item type. The page counts as a
///    home page when `document_url` is present and its path is `/` or
///    `/index.html`.
/// 2. Extract properties from the tags, resolving URLs against `base_url`.
/// 3. If at least one `<meta>` tag was collected, fill missing `name` /
///    `summary` from the fallbacks and a missing `photo` from an icon link.
///
/// Returns `None` when there are no types or no properties to report.
pub fn parse_metaformats_from_head(
    head: &swc_html_ast::Element,
    base_url: &Url,
    document_url: Option<&Url>,
) -> Option<Item> {
    let meta_tags = collect_meta_tags(head);

    let is_home_page = match document_url {
        Some(url) => matches!(url.path(), "/" | "/index.html"),
        None => false,
    };

    let types = determine_item_type(&meta_tags, is_home_page);
    let mut properties = extract_properties(&meta_tags, base_url, &types);

    // Fallbacks only apply when the document carried some <meta> tag with content.
    if !meta_tags.is_empty() {
        apply_fallback_properties(head, &mut properties);

        if !properties.contains_key("photo") {
            if let Some(icon_url) = find_icon_url(head, base_url) {
                properties.insert(
                    "photo".to_string(),
                    vec![PropertyValue::Url(UrlValue::new(icon_url))],
                );
            }
        }
    }

    if types.is_empty() || properties.is_empty() {
        return None;
    }

    let mut item = Item::new(types);
    item.properties = properties;
    Some(item)
}
#[cfg(feature = "metaformats")]
/// Fills `name` and `summary` from plain HTML metadata when they are missing.
///
/// `name` falls back to the text of `<title>`, `summary` to the content of
/// `<meta name="description">`. Keys already present in `properties` are left
/// untouched, and a key stays absent when its fallback source is not found.
fn apply_fallback_properties(head: &swc_html_ast::Element, properties: &mut Properties) {
    // Each property key paired with the lookup that supplies its fallback text.
    let fallbacks: [(&str, fn(&swc_html_ast::Element) -> Option<String>); 2] = [
        ("name", find_title_content),
        ("summary", find_meta_description_content),
    ];

    for (key, lookup) in fallbacks {
        if properties.contains_key(key) {
            continue;
        }
        if let Some(text) = lookup(head) {
            properties.insert(
                key.to_string(),
                vec![PropertyValue::Plain(TextValue::new(text))],
            );
        }
    }
}
#[cfg(feature = "metaformats")]
/// Returns the trimmed text of the first `<title>` element at or below `head`.
///
/// Only direct text children of the `<title>` element are concatenated. The
/// search is depth-first in document order; `None` means no `<title>` element
/// was found. A `<title>` without text yields `Some` of an empty string.
fn find_title_content(head: &swc_html_ast::Element) -> Option<String> {
    fn find_title_recursive(element: &swc_html_ast::Element) -> Option<String> {
        if element.tag_name == "title" {
            let raw = element
                .children
                .iter()
                .fold(String::new(), |mut joined, node| {
                    if let swc_html_ast::Child::Text(text_node) = node {
                        joined.push_str(&text_node.data);
                    }
                    joined
                });
            return Some(raw.trim().to_string());
        }

        element.children.iter().find_map(|node| match node {
            swc_html_ast::Child::Element(nested) => find_title_recursive(nested),
            _ => None,
        })
    }

    find_title_recursive(head)
}
#[cfg(feature = "metaformats")]
/// Returns the `content` of the first usable `<meta name="description">`.
///
/// The search is depth-first in document order. A description `<meta>` that
/// has no `content` value yields nothing for that element and the search
/// moves on to later elements. The `name` comparison is an exact match
/// against `description`.
fn find_meta_description_content(head: &swc_html_ast::Element) -> Option<String> {
    fn find_meta_recursive(element: &swc_html_ast::Element) -> Option<String> {
        if element.tag_name == "meta" {
            let is_description = element.attributes.iter().any(|attr| {
                attr.name == "name"
                    && attr
                        .value
                        .as_ref()
                        .is_some_and(|value| *value == "description")
            });

            if is_description {
                // The last `content` attribute that carries a value wins.
                return element.attributes.iter().rev().find_map(|attr| {
                    if attr.name == "content" {
                        attr.value.as_ref().map(|value| value.to_string())
                    } else {
                        None
                    }
                });
            }
        }

        element.children.iter().find_map(|node| match node {
            swc_html_ast::Child::Element(nested) => find_meta_recursive(nested),
            _ => None,
        })
    }

    find_meta_recursive(head)
}
#[cfg(feature = "metaformats")]
/// Finds the first icon `<link>` at or below `head` and resolves its `href`.
///
/// A `<link>` counts as an icon when any token of its `rel` attribute is
/// `icon`, `apple-touch-icon` or `msapplication-TileImage`. The `rel` value is
/// treated as a whitespace-separated token list and tokens are compared
/// ASCII case-insensitively, so `rel="shortcut icon"`, `rel="Icon"` and
/// `rel="icon preload"` are all recognised.
///
/// The search is depth-first in document order. The `href` is resolved
/// against `base_url`; an icon link whose `href` is missing or cannot be
/// resolved yields nothing for that element and the search moves on.
fn find_icon_url(head: &swc_html_ast::Element, base_url: &Url) -> Option<Url> {
    /// `rel` tokens that mark a link as an icon. `shortcut icon` is covered
    /// by its `icon` token.
    const ICON_REL_TOKENS: [&str; 3] = ["icon", "apple-touch-icon", "msapplication-TileImage"];

    /// True when any whitespace-separated token of `rel` names an icon.
    fn is_icon_rel(rel: &str) -> bool {
        rel.split_ascii_whitespace().any(|token| {
            ICON_REL_TOKENS
                .iter()
                .any(|known| token.eq_ignore_ascii_case(known))
        })
    }

    fn find_link_recursive(element: &swc_html_ast::Element, base_url: &Url) -> Option<Url> {
        if element.tag_name == "link" {
            let mut is_icon = false;
            let mut href = None;

            for attr in &element.attributes {
                let Some(value) = &attr.value else {
                    continue;
                };
                if attr.name == "rel" {
                    if is_icon_rel(&value.to_string()) {
                        is_icon = true;
                    }
                } else if attr.name == "href" {
                    href = Some(value.to_string());
                }
            }

            if is_icon {
                if let Some(href) = href {
                    return base_url.join(&href).ok();
                }
            }
        }

        element.children.iter().find_map(|node| match node {
            swc_html_ast::Child::Element(nested) => find_link_recursive(nested, base_url),
            _ => None,
        })
    }

    find_link_recursive(head, base_url)
}
// Unit and integration tests for the metaformats fallback parser.
// Every test is additionally gated on the `metaformats` feature, so the
// module is empty when that feature is disabled.
#[cfg(test)]
mod tests {
use super::*;
// `og:type=article` on a home page is classified as `h-card`.
#[cfg(feature = "metaformats")]
#[test]
fn test_determine_item_type_home_page() {
let meta_tags = vec![MetaTag {
name: None,
property: Some("og:type".to_string()),
content: "article".to_string(),
}];
let types = determine_item_type(&meta_tags, true);
assert_eq!(types.len(), 1);
match &types[0] {
Class::Known(known) => assert_eq!(known.to_string(), "h-card"),
_ => panic!("Expected known class"),
}
}
// `og:type=article` on a non-home page is classified as `h-entry`.
#[cfg(feature = "metaformats")]
#[test]
fn test_determine_item_type_regular_page() {
let meta_tags = vec![MetaTag {
name: None,
property: Some("og:type".to_string()),
content: "article".to_string(),
}];
let types = determine_item_type(&meta_tags, false);
assert_eq!(types.len(), 1);
match &types[0] {
Class::Known(known) => assert_eq!(known.to_string(), "h-entry"),
_ => panic!("Expected known class"),
}
}
// End-to-end: `og:title` and `og:description` become `name` and `summary`
// on an `h-entry` meta item.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_basic_og_tags() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta property="og:title" content="Test Title">
<meta property="og:description" content="Test Description">
<meta property="og:type" content="article">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?
.into_document(Some("https://example.com/page.html".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
assert_eq!(meta_item.r#type.len(), 1);
assert_eq!(meta_item.r#type[0].to_string(), "h-entry");
let name_values = meta_item.properties.get("name").unwrap();
assert_eq!(name_values.len(), 1);
assert_eq!(
name_values[0],
PropertyValue::Plain(TextValue::new("Test Title".to_string()))
);
let summary_values = meta_item.properties.get("summary").unwrap();
assert_eq!(summary_values.len(), 1);
assert_eq!(
summary_values[0],
PropertyValue::Plain(TextValue::new("Test Description".to_string()))
);
Ok(())
}
// End-to-end: `og:image` is exposed as a URL-valued `photo`, and no
// `featured` property is produced.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_u_photo_for_entry() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta property="og:title" content="Test Title">
<meta property="og:image" content="https://example.com/image.jpg">
<meta property="og:type" content="article">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?
.into_document(Some("https://example.com/page.html".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let photo_values = meta_item.properties.get("photo").unwrap();
assert_eq!(photo_values.len(), 1);
assert!(matches!(photo_values[0], PropertyValue::Url(_)));
assert!(meta_item.properties.get("featured").is_none());
Ok(())
}
// End-to-end: with no title meta tag, `name` falls back to the `<title>` text.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_title_fallback() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Fallback Title</title>
<meta name="twitter:card" content="summary">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?
.into_document(Some("https://example.com/page.html".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let name_values = meta_item.properties.get("name").unwrap();
assert_eq!(name_values.len(), 1);
assert_eq!(
name_values[0],
PropertyValue::Plain(TextValue::new("Fallback Title".to_string()))
);
Ok(())
}
// End-to-end: with no image meta tag, `photo` falls back to the
// `<link rel="icon">` URL.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_icon_fallback() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Site Title</title>
<link rel="icon" href="/favicon.ico">
<meta name="twitter:card" content="summary">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?
.into_document(Some("https://example.com/page.html".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let photo_values = meta_item.properties.get("photo").unwrap();
assert_eq!(photo_values.len(), 1);
assert!(matches!(photo_values[0], PropertyValue::Url(_)));
Ok(())
}
}