use microformats_types::{Class, Item, Properties, PropertyValue};
use url::Url;
/// A single `<meta>` element captured from the document head.
///
/// `collect_meta_tags` only records elements that have a `content`
/// attribute, so `content` is always present while `name` and `property`
/// are optional.
#[derive(Debug, Clone)]
pub struct MetaTag {
/// Value of the `name` attribute, if any (e.g. `twitter:card`).
pub name: Option<String>,
/// Value of the `property` attribute, if any (e.g. `og:type`).
pub property: Option<String>,
/// Value of the `content` attribute.
pub content: String,
}
/// Collects every `<meta>` element that has a `content` attribute from
/// `head` and all of its descendant elements, in depth-first document order.
///
/// `<meta>` elements without a `content` value are skipped. The `name` and
/// `property` attributes are captured when they are present.
pub fn collect_meta_tags(head: &swc_html_ast::Element) -> Vec<MetaTag> {
    /// Value of the first attribute on `node` whose name equals `wanted`.
    fn attribute_value(node: &swc_html_ast::Element, wanted: &str) -> Option<String> {
        node.attributes
            .iter()
            .find(|attribute| attribute.name.to_string() == wanted)
            .and_then(|attribute| attribute.value.as_ref().map(|value| value.to_string()))
    }

    /// Depth-first walk that appends matching `<meta>` elements to `found`.
    fn walk(node: &swc_html_ast::Element, found: &mut Vec<MetaTag>) {
        if node.tag_name.to_string() == "meta" {
            let name = attribute_value(node, "name");
            let property = attribute_value(node, "property");
            if let Some(content) = attribute_value(node, "content") {
                found.push(MetaTag {
                    name,
                    property,
                    content,
                });
            }
        }

        // Only element children can contain further `<meta>` tags.
        let nested_elements = node.children.iter().filter_map(|child| match child {
            swc_html_ast::Child::Element(inner) => Some(inner),
            _ => None,
        });
        for inner in nested_elements {
            walk(inner, found);
        }
    }

    let mut collected = Vec::new();
    walk(head, &mut collected);
    collected
}
pub fn determine_item_type(meta_tags: &[MetaTag], is_home_page: bool) -> Vec<Class> {
for tag in meta_tags {
if let Some(property) = &tag.property {
if property == "og:type" {
match tag.content.as_str() {
"article" => {
if is_home_page {
return vec![Class::Known(microformats_types::KnownClass::Card)];
} else {
return vec![Class::Known(microformats_types::KnownClass::Entry)];
}
}
"profile" => return vec![Class::Known(microformats_types::KnownClass::Card)],
"music" | "video" => return vec![Class::Known(microformats_types::KnownClass::Cite)],
_ => {} }
}
}
}
for tag in meta_tags {
if let Some(name) = &tag.name {
if name == "twitter:card" {
if matches!(tag.content.as_str(), "summary" | "summary_large_image") {
if is_home_page {
return vec![Class::Known(microformats_types::KnownClass::Card)];
} else {
return vec![Class::Known(microformats_types::KnownClass::Entry)];
}
}
}
}
}
let has_og_tags = meta_tags.iter().any(|tag| tag.property.as_ref().map_or(false, |p| p.starts_with("og:")));
let has_twitter_tags = meta_tags.iter().any(|tag| tag.name.as_ref().map_or(false, |n| n.starts_with("twitter:")));
if has_og_tags || has_twitter_tags {
if is_home_page {
return vec![Class::Known(microformats_types::KnownClass::Card)];
} else {
return vec![Class::Known(microformats_types::KnownClass::Entry)];
}
}
vec![Class::Known(microformats_types::KnownClass::Entry)]
}
/// Translates Open Graph and Twitter card meta tags into microformats
/// properties.
///
/// * Tags with empty `content` are ignored.
/// * Open Graph / `article:*` properties always overwrite an earlier value
///   for the same key.
/// * `twitter:*` names only fill a key that is not already present.
/// * Image URLs are stored under `featured` when `item_types` contains
///   `h-entry`, otherwise under `photo`.
/// * URL-valued tags are resolved against `base_url`; values that fail to
///   resolve are dropped.
pub fn extract_properties(meta_tags: &[MetaTag], base_url: &Url, item_types: &[microformats_types::Class]) -> Properties {
    let describes_entry = item_types.iter().any(|class| {
        matches!(class, microformats_types::Class::Known(microformats_types::KnownClass::Entry))
    });
    let image_key = if describes_entry { "featured" } else { "photo" };

    // Builders for the two value shapes used below.
    let as_text = |raw: &str| vec![PropertyValue::Plain(raw.to_string())];
    let as_link = |raw: &str| {
        base_url
            .join(raw)
            .ok()
            .map(|resolved| vec![PropertyValue::Url(resolved)])
    };

    let mut collected = Properties::new();
    for tag in meta_tags.iter().filter(|tag| !tag.content.is_empty()) {
        let raw = tag.content.as_str();

        // Open Graph / article metadata: unconditional insert.
        let from_property = match tag.property.as_deref() {
            Some("og:title") => Some(("name", as_text(raw))),
            Some("og:description") => Some(("summary", as_text(raw))),
            Some("og:image") => as_link(raw).map(|values| (image_key, values)),
            Some("og:video") => as_link(raw).map(|values| ("video", values)),
            Some("og:audio") => as_link(raw).map(|values| ("audio", values)),
            Some("article:published_time") => Some(("published", as_text(raw))),
            Some("article:modified_time") => Some(("updated", as_text(raw))),
            Some("article:author") => Some(("author", as_text(raw))),
            _ => None,
        };
        if let Some((key, values)) = from_property {
            collected.insert(key.to_string(), values);
        }

        // Twitter card metadata: only used when the key is still missing.
        let from_name = match tag.name.as_deref() {
            Some("twitter:title") if !collected.contains_key("name") => {
                Some(("name", as_text(raw)))
            }
            Some("twitter:description") if !collected.contains_key("summary") => {
                Some(("summary", as_text(raw)))
            }
            Some("twitter:image")
                if !collected.contains_key("photo") && !collected.contains_key("featured") =>
            {
                as_link(raw).map(|values| (image_key, values))
            }
            _ => None,
        };
        if let Some((key, values)) = from_name {
            collected.insert(key.to_string(), values);
        }
    }
    collected
}
/// Builds a metaformats item from the `<head>` element of a document.
///
/// Steps:
/// 1. collect the `<meta>` tags under `head`;
/// 2. treat the page as a home page when `document_url` has the path `/` or
///    `/index.html` (no `document_url` means "not a home page");
/// 3. derive the item type and the properties from the meta tags;
/// 4. when a type was determined, fill `name` / `summary` from `<title>` and
///    `<meta name="description">` if they are still missing;
/// 5. when neither `photo` nor `featured` is set, fall back to an icon link.
///
/// Returns `None` only if there are neither types nor properties.
pub fn parse_metaformats_from_head(head: &swc_html_ast::Element, base_url: &Url, document_url: Option<&Url>) -> Option<Item> {
    let meta_tags = collect_meta_tags(head);

    let is_home_page = match document_url {
        Some(url) => matches!(url.path(), "/" | "/index.html"),
        None => false,
    };

    let types = determine_item_type(&meta_tags, is_home_page);
    let mut properties = extract_properties(&meta_tags, base_url, &types);

    if !types.is_empty() {
        apply_fallback_properties(head, &mut properties);
    }

    let has_image = properties.contains_key("photo") || properties.contains_key("featured");
    if !has_image {
        if let Some(icon_url) = find_icon_url(head, base_url) {
            properties.insert("photo".to_string(), vec![PropertyValue::Url(icon_url)]);
        }
    }

    if types.is_empty() && properties.is_empty() {
        return None;
    }

    let mut item = Item::new(types);
    item.properties = properties;
    Some(item)
}
fn apply_fallback_properties(head: &swc_html_ast::Element, properties: &mut Properties) {
if !properties.contains_key("name") {
if let Some(title_content) = find_title_content(head) {
properties.insert("name".to_string(), vec![PropertyValue::Plain(title_content)]);
}
}
if !properties.contains_key("summary") {
if let Some(description_content) = find_meta_description_content(head) {
properties.insert("summary".to_string(), vec![PropertyValue::Plain(description_content)]);
}
}
}
/// Returns the trimmed text of the first `<title>` element found by a
/// depth-first search starting at `head`.
///
/// Only the direct text children of the `<title>` element are concatenated.
/// An empty `<title>` yields `Some("")`; `None` means no `<title>` exists.
fn find_title_content(head: &swc_html_ast::Element) -> Option<String> {
    fn search(node: &swc_html_ast::Element) -> Option<String> {
        if node.tag_name.to_string() != "title" {
            // Not a title: try each child element in order.
            return node.children.iter().find_map(|child| match child {
                swc_html_ast::Child::Element(inner) => search(inner),
                _ => None,
            });
        }

        let joined = node.children.iter().fold(String::new(), |mut buffer, child| {
            if let swc_html_ast::Child::Text(text_node) = child {
                buffer.push_str(&text_node.data);
            }
            buffer
        });
        Some(joined.trim().to_string())
    }

    search(head)
}
/// Returns the `content` of the first `<meta name="description">` element
/// that has one, searching depth-first from `head`.
///
/// If a `<meta>` carries several `content` attributes, the last one that has
/// a value is used. A description `<meta>` without `content` contributes
/// nothing and the search moves on to later elements.
fn find_meta_description_content(head: &swc_html_ast::Element) -> Option<String> {
    fn search(node: &swc_html_ast::Element) -> Option<String> {
        if node.tag_name.to_string() == "meta" {
            let names_description = node.attributes.iter().any(|attribute| {
                attribute.name.to_string() == "name"
                    && attribute
                        .value
                        .as_ref()
                        .map_or(false, |value| value.to_string() == "description")
            });

            if names_description {
                // Scan from the end so the last valued `content` wins.
                return node
                    .attributes
                    .iter()
                    .rev()
                    .filter(|attribute| attribute.name.to_string() == "content")
                    .find_map(|attribute| attribute.value.as_ref().map(|value| value.to_string()));
            }
        }

        node.children.iter().find_map(|child| match child {
            swc_html_ast::Child::Element(inner) => search(inner),
            _ => None,
        })
    }

    search(head)
}
/// Finds the first icon `<link>` under `head` and resolves its `href`
/// against `base_url`.
///
/// A `<link>` counts as an icon when any whitespace-separated token of its
/// `rel` attribute equals `icon`, `apple-touch-icon` or
/// `msapplication-TileImage`, compared ASCII case-insensitively. `rel` holds
/// a set of tokens, so values such as `shortcut icon` or `ICON` must match
/// as well as a bare `icon`.
///
/// Returns `None` when no icon link with an `href` exists, or when the first
/// such `href` cannot be resolved against `base_url`.
fn find_icon_url(head: &swc_html_ast::Element, base_url: &Url) -> Option<Url> {
    /// Link relation tokens that are treated as pointing at an icon.
    const ICON_RELS: [&str; 3] = ["icon", "apple-touch-icon", "msapplication-TileImage"];

    /// True when any token of the `rel` value is a known icon relation.
    fn is_icon_rel(rel: &str) -> bool {
        rel.split_ascii_whitespace()
            .any(|token| ICON_RELS.iter().any(|known| token.eq_ignore_ascii_case(known)))
    }

    fn find_link_recursive(element: &swc_html_ast::Element, base_url: &Url) -> Option<Url> {
        if element.tag_name.to_string() == "link" {
            let mut is_icon = false;
            let mut href = None;
            for attr in &element.attributes {
                match attr.name.to_string().as_str() {
                    "rel" => {
                        if let Some(value) = &attr.value {
                            if is_icon_rel(&value.to_string()) {
                                is_icon = true;
                            }
                        }
                    }
                    "href" => {
                        if let Some(value) = &attr.value {
                            href = Some(value.to_string());
                        }
                    }
                    _ => {}
                }
            }
            if is_icon {
                if let Some(href) = href {
                    return base_url.join(&href).ok();
                }
            }
        }

        element.children.iter().find_map(|child| match child {
            swc_html_ast::Child::Element(child_element) => {
                find_link_recursive(child_element, base_url)
            }
            _ => None,
        })
    }

    find_link_recursive(head, base_url)
}
// Unit tests for the type-detection helper plus feature-gated end-to-end
// tests that run HTML through the crate's parser.
#[cfg(test)]
mod tests {
use super::*;
// `og:type=article` on a home page must be classified as `h-card`.
#[test]
fn test_determine_item_type_home_page() {
let meta_tags = vec![
MetaTag {
name: None,
property: Some("og:type".to_string()),
content: "article".to_string(),
}
];
let types = determine_item_type(&meta_tags, true);
assert_eq!(types.len(), 1);
match &types[0] {
Class::Known(known) => assert_eq!(known.to_string(), "h-card"),
_ => panic!("Expected known class"),
}
}
// `og:type=article` on any other page must be classified as `h-entry`.
#[test]
fn test_determine_item_type_regular_page() {
let meta_tags = vec![
MetaTag {
name: None,
property: Some("og:type".to_string()),
content: "article".to_string(),
}
];
let types = determine_item_type(&meta_tags, false);
assert_eq!(types.len(), 1);
match &types[0] {
Class::Known(known) => assert_eq!(known.to_string(), "h-entry"),
_ => panic!("Expected known class"),
}
}
// The remaining tests are compiled only when the `metaformats` feature is
// enabled (the `test` condition is already implied by the enclosing module).
// NOTE(review): the URL handed to `into_document` is presumably the
// document/base URL — confirm against `Parser`.
//
// og:title / og:description become `name` / `summary` on an `h-entry` item.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_basic_og_tags() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta property="og:title" content="Test Title">
<meta property="og:description" content="Test Description">
<meta property="og:type" content="article">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?.into_document(Some("https://example.com".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
assert_eq!(meta_item.r#type.len(), 1);
assert_eq!(meta_item.r#type[0].to_string(), "h-entry");
let name_values = meta_item.properties.get("name").unwrap();
assert_eq!(name_values.len(), 1);
assert_eq!(name_values[0], PropertyValue::Plain("Test Title".to_string()));
let summary_values = meta_item.properties.get("summary").unwrap();
assert_eq!(summary_values.len(), 1);
assert_eq!(summary_values[0], PropertyValue::Plain("Test Description".to_string()));
Ok(())
}
// For an entry, og:image must be stored under `featured` and not `photo`.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_u_featured_for_entry() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta property="og:title" content="Test Title">
<meta property="og:image" content="https://example.com/image.jpg">
<meta property="og:type" content="article">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?.into_document(Some("https://example.com".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let featured_values = meta_item.properties.get("featured").unwrap();
assert_eq!(featured_values.len(), 1);
assert!(matches!(featured_values[0], PropertyValue::Url(_)));
assert!(meta_item.properties.get("photo").is_none());
Ok(())
}
// Without og:title / twitter:title, `name` falls back to the <title> text.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_title_fallback() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Fallback Title</title>
<meta name="twitter:card" content="summary">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?.into_document(Some("https://example.com".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let name_values = meta_item.properties.get("name").unwrap();
assert_eq!(name_values.len(), 1);
assert_eq!(name_values[0], PropertyValue::Plain("Fallback Title".to_string()));
Ok(())
}
// Without any image meta tag, `photo` falls back to the <link rel="icon"> URL.
#[cfg(all(test, feature = "metaformats"))]
#[test]
fn test_metaformats_icon_fallback() -> Result<(), crate::Error> {
use crate::parse::Parser;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Site Title</title>
<link rel="icon" href="/favicon.ico">
<meta name="twitter:card" content="summary">
</head>
<body></body>
</html>
"#;
let doc = Parser::from_html(html.to_string())?.into_document(Some("https://example.com".parse()?))?;
assert!(doc.meta_item.is_some());
let meta_item = doc.meta_item.as_ref().unwrap();
let photo_values = meta_item.properties.get("photo").unwrap();
assert_eq!(photo_values.len(), 1);
assert!(matches!(photo_values[0], PropertyValue::Url(_)));
Ok(())
}
}