// microformats 0.18.2
//
// A union library of the Microformats types and associated parser.
// Documentation
use std::str::FromStr;

use crate::parse::{
    element::{Extraction, HtmlUrlExpander},
    remove_surrounding_whitespace,
};

#[cfg(feature = "picture")]
use crate::parse::picture::PictureParser;

use super::*;
use html_escape::decode_html_entities;
/// Extracts the value of a single microformats property from one element.
///
/// Which parsing rule is applied is decided by `kind` when `expand` is called.
pub(crate) struct PropertyParser {
    /// Element the property value is read from.
    pub(crate) elem: ElementPtr,
    /// Property declaration being parsed; `expand` dispatches on it.
    kind: DeclKind,
    /// URL handed to the extraction helpers and used to join linked values against.
    base_url: Url,
    /// Optional parent element; its root classes are inspected to detect an h-review context.
    parent: Option<ElementPtr>,
}

impl std::fmt::Debug for PropertyParser {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> swc_html_codegen::Result {
        f.debug_struct("PropertyParser")
            .field("forKind", &self.kind)
            .finish()
    }
}

impl PropertyParser {
    /// Creates a parser for the property declared as `kind` on `elem`.
    ///
    /// `base_url` is stored for the extraction helpers and for joining linked values;
    /// `parent`, when given, is later consulted for the h-review property-name mapping.
    /// The arguments are stored as-is; no parsing happens here.
    #[tracing::instrument(level = "trace", ret)]
    pub(crate) fn new(
        elem: ElementPtr,
        kind: DeclKind,
        base_url: Url,
        parent: Option<ElementPtr>,
    ) -> Self {
        Self {
            elem,
            kind,
            base_url,
            parent,
        }
    }

    /// Checks if this property is within an h-review microformat context.
    ///
    /// This is used to implement backward compatibility mappings for h-review,
    /// specifically the normalization of e-description (legacy) to e-content (current spec).
    /// See: https://microformats.org/wiki/h-review
    fn is_in_h_review_context(&self) -> bool {
        if let Some(parent) = &self.parent {
            return parent.node.root_classes().iter().any(|class| {
                matches!(
                    class,
                    microformats_types::Class::Known(microformats_types::KnownClass::Review)
                )
            });
        }
        false
    }
    /// Resolves the value of a plain-text (`p-*`) property from this parser's element.
    ///
    /// Candidate sources are consulted in order and the first one that yields a value wins:
    ///
    /// 1. the value-class-pattern, kept only if it passes `non_empty_property_value`,
    /// 2. the `title` attribute on `abbr` or `link`,
    /// 3. the `value` attribute on `data` or `input`,
    /// 4. the `alt` attribute on `img` or `area`,
    /// 5. the `alt` of the image parsed from a `picture` element (only when the
    ///    `picture` feature is enabled),
    /// 6. the element's text content.
    ///
    /// Attribute values in steps 2-4 are only accepted when they pass `non_empty_string`.
    /// A plain-text result is passed through `remove_surrounding_whitespace` before it is
    /// returned; any other kind of value is returned untouched.
    ///
    /// # Errors
    ///
    /// Returns an error when value-class-pattern extraction fails. A failure while reading
    /// the text content in step 6 is not propagated; it results in `Ok(None)`.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn expand_plain_property_value(&self) -> Result<Option<PropertyValue>, crate::parse::Error> {
        let elem = &self.elem.node;
        // 1: Check for VCP first; use if found.
        let resolved = value_class::ValueClassPropertyExtractor {
            element: Arc::clone(&self.elem),
            hint: value_class::TypeHint::Plain,
        }
        .extract_value_class(&self.base_url)?
        .filter(non_empty_property_value)
        // 2. If abbr.p-x[title] or link.p-x[title], return the title attribute.
        .or_else(|| {
            elem.attr("title")
                .filter(|_| ["abbr", "link"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(|s| PropertyValue::Plain(TextValue::new(s)))
        })
        // 3. else if data.p-x[value] or input.p-x[value], then return the value attribute.
        //    `input` is accepted alongside `data`, matching the linked and temporal parsers.
        .or_else(|| {
            elem.attr("value")
                .filter(|_| ["data", "input"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(|s| PropertyValue::Plain(TextValue::new(s)))
        })
        // 4. else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
        .or_else(|| {
            elem.attr("alt")
                .filter(|_| ["img", "area"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(|s| PropertyValue::Plain(TextValue::new(s)))
        })
        // 5. else if picture.p-x[alt], then return the alt attribute from fallback img
        .or_else(|| {
            #[cfg(feature = "picture")]
            {
                if elem.tag() == "picture" {
                    let picture_parser = PictureParser::new();
                    if let Ok(Some(PropertyValue::Image(image))) =
                        picture_parser.parse_image_element(elem, &self.base_url)
                    {
                        return image
                            .alt
                            .map(|alt| PropertyValue::Plain(TextValue::new(alt)));
                    }
                }
            }
            None
        })
        // 6. else return the textContent of the element
        .or_else(|| {
            elem.text_content_with_img_links(&self.base_url)
                .ok()
                .map(|Extraction { text, .. }| PropertyValue::Plain(TextValue::new(text)))
        });

        // Only plain-text results are trimmed; every other variant passes through as-is.
        Ok(resolved.map(|value| match value {
            PropertyValue::Plain(plain_text_value) => PropertyValue::Plain(TextValue::new(
                remove_surrounding_whitespace(plain_text_value.to_string()),
            )),
            other => other,
        }))
    }

    /// Resolves the value of a linked (`u-*`) property from this parser's element.
    ///
    /// A candidate string is taken from the first source that yields one:
    /// `href` on `a`/`area`/`link`, `src` on `audio`/`video`/`source`/`iframe`,
    /// `poster` on `video`, `data` on `object`, a plain value from the
    /// value-class-pattern, `title` on `abbr`, `value` on `input`/`data`, and finally
    /// the element's text content. When nothing yields a value the candidate is the
    /// empty string.
    ///
    /// The result then depends on the element's tag:
    /// - `picture`: with the `picture` feature enabled, whatever `PictureParser` produces
    ///   if it produces a value; otherwise the URL-or-text fallback below.
    /// - `img`: the result of `extract_img_element`.
    /// - anything else: the candidate joined against the base URL as a URL value, or the
    ///   candidate as plain text when joining fails.
    ///
    /// Errors from value-class-pattern extraction and from reading the text content are
    /// discarded here and treated as "no value".
    fn expand_linked_property_value(&self) -> Option<PropertyValue> {
        let node = &self.elem.node;
        let tag = node.tag();

        let candidate = node
            // 1. if a.u-x[href] or area.u-x[href] or link.u-x[href], then get the href attribute
            .attr("href")
            .filter(|_| matches!(tag, "a" | "area" | "link"))
            // 3. else if audio.u-x[src] or video.u-x[src] or source.u-x[src] or iframe.u-x[src],
            //    then get the src attribute
            .or_else(|| {
                node.attr("src")
                    .filter(|_| matches!(tag, "audio" | "video" | "source" | "iframe"))
            })
            // 4. else if video.u-x[poster], then get the poster attribute
            .or_else(|| {
                node.attr("poster")
                    .filter(non_empty_string)
                    .filter(|_| tag == "video")
            })
            // 5. else if object.u-x[data], then get the data attribute
            .or_else(|| {
                node.attr("data")
                    .filter(non_empty_string)
                    .filter(|_| tag == "object")
            })
            // 6. else parse the element for the value-class-pattern. If a value is found, get it
            // TODO: Refactor this to properly throw an exception.
            .or_else(|| {
                let extracted = value_class::ValueClassPropertyExtractor {
                    element: Arc::clone(&self.elem),
                    hint: value_class::TypeHint::Plain,
                }
                .extract_value_class(&self.base_url);

                extracted
                    .ok()
                    .flatten()
                    .filter(non_empty_property_value)
                    .and_then(|found| match found {
                        PropertyValue::Plain(value) => Some(value.to_string()),
                        _ => None,
                    })
            })
            // 7. else if abbr.u-x[title], then get the title attribute
            .or_else(|| {
                node.attr("title")
                    .filter(non_empty_string)
                    .filter(|_| tag == "abbr")
            })
            // 8. else if data.u-x[value] or input.u-x[value], then get the value attribute
            .or_else(|| {
                node.attr("value")
                    .filter(non_empty_string)
                    .filter(|_| matches!(tag, "input" | "data"))
            })
            // 9. else get the textContent of the element after removing all leading/trailing
            //    spaces and nested <script> & <style> elements
            .or_else(|| node.text_content(&self.base_url).ok().map(Into::into));

        let linked_text = candidate.unwrap_or_default();

        // Shared last resort: a URL when the candidate joins against the base URL,
        // otherwise the candidate itself as plain text.
        let url_or_text = move || {
            let joined = self.base_url.join(&linked_text);
            match joined {
                Ok(url) => PropertyValue::Url(UrlValue::new(url)),
                Err(_) => PropertyValue::Plain(TextValue::new(linked_text)),
            }
        };

        match tag {
            "picture" => {
                // Picture element processing - delegate to picture parser
                #[cfg(feature = "picture")]
                {
                    let picture_parser = PictureParser::new();
                    match picture_parser.parse_image_element(node, &self.base_url) {
                        Ok(Some(property_value)) => Some(property_value),
                        // Fallback to text content if picture parsing fails
                        _ => Some(url_or_text()),
                    }
                }
                #[cfg(not(feature = "picture"))]
                {
                    // Fallback to text content when picture feature is not enabled
                    Some(url_or_text())
                }
            }
            // 2. image processing
            "img" => extract_img_element(node, &self.base_url),
            _ => Some(url_or_text()),
        }
    }

    /// Resolves the value of a temporal (`dt-*`) property from this parser's element.
    ///
    /// A value found through the value-class-pattern (with the temporal hint) is returned
    /// directly. Otherwise a string is taken from the `datetime` attribute on
    /// `time`/`ins`/`del`, the `title` attribute on `abbr`, the `value` attribute on
    /// `data`/`input`, or the element's text content, in that order. The string is parsed
    /// as a temporal value; when parsing fails it is kept as plain text instead. Either
    /// way the result is dropped (`None`) if `non_empty_property_value` rejects it.
    ///
    /// # Errors
    ///
    /// Returns an error when value-class-pattern extraction or reading the text content
    /// fails.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn expand_temporal_property_value(&self) -> Result<Option<PropertyValue>, crate::parse::Error> {
        let elem = &self.elem.node;

        // 1. parse the element for the value-class-pattern, including the date and time
        //    parsing rules. If a value is found, then return it.
        let from_vcp = value_class::ValueClassPropertyExtractor {
            element: Arc::clone(&self.elem),
            hint: value_class::TypeHint::Temporal,
        }
        .extract_value_class(&self.base_url)?;
        if let Some(found) = from_vcp {
            return Ok(Some(found));
        }

        let tag = elem.tag();
        let from_attribute = elem
            // 2. if time.dt-x[datetime] or ins.dt-x[datetime] or del.dt-x[datetime],
            //    then return the datetime attribute
            .attr("datetime")
            .filter(|_| matches!(tag, "time" | "ins" | "del"))
            .filter(non_empty_string)
            // 3. else if abbr.dt-x[title], then return the title attribute
            .or_else(|| {
                elem.attr("title")
                    .filter(|_| tag == "abbr")
                    .filter(non_empty_string)
            })
            // 4. else if data.dt-x[value] or input.dt-x[value], then return the value attribute
            .or_else(|| {
                elem.attr("value")
                    .filter(|_| matches!(tag, "data" | "input"))
                    .filter(non_empty_string)
            });

        let dt_str = match from_attribute {
            Some(text) => text,
            // 5. else return the textContent of the element after removing all
            //    leading/trailing spaces and nested <script> & <style> elements.
            None => elem.text_content(&self.base_url).map(Into::into)?,
        };

        // Strings that do not parse as a temporal value are kept as plain text.
        let parsed = microformats_types::temporal::Value::from_str(&dt_str);
        let value = match parsed {
            Ok(temporal) => PropertyValue::Temporal(temporal),
            Err(_) => PropertyValue::Plain(TextValue::new(dt_str)),
        };
        Ok(Some(value).filter(non_empty_property_value))
    }

    /// Builds the embedded-markup (`e-*`) value for this parser's element.
    ///
    /// The fragment is assembled from the element's HTML content after
    /// `HtmlUrlExpander::expand_urls_in_html_string` has processed it with the base URL,
    /// the element's text (surrounding whitespace removed, HTML entities decoded), and
    /// the links reported alongside that text. With the `per_element_lang` feature the
    /// element's `lang` attribute is passed along as well.
    ///
    /// # Errors
    ///
    /// Returns an error when reading the HTML content, expanding its URLs, or reading
    /// the text content fails.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn get_embedded_html(&self) -> Result<PropertyValue, crate::parse::Error> {
        let node = &self.elem.node;

        let raw_html = node.html_content()?;
        let html = HtmlUrlExpander::expand_urls_in_html_string(&raw_html, &self.base_url)?;

        let extraction = node.text_content_with_img_links(&self.base_url)?;
        let trimmed = remove_surrounding_whitespace(extraction.text);
        let value = decode_html_entities(&trimmed).to_string();

        let fragment = Fragment::new(
            html,
            value,
            extraction.links,
            #[cfg(feature = "per_element_lang")]
            node.attr("lang"),
        );
        Ok(PropertyValue::Fragment(fragment))
    }

    /// Expands this property into a `(name, value)` pair according to its declaration kind.
    ///
    /// Plain, linked and temporal declarations delegate to their respective value
    /// resolvers and yield `None` when no value could be resolved. Hypertext declarations
    /// always yield a value.
    ///
    /// # Errors
    ///
    /// Propagates errors from the plain, temporal and hypertext resolvers.
    ///
    /// # Panics
    ///
    /// Panics (`unreachable!`) when called on a `DeclKind::Root` declaration.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    pub(crate) fn expand(&self) -> Result<Option<(String, PropertyValue)>, crate::parse::Error> {
        let expanded = match &self.kind {
            DeclKind::Root(_) => unreachable!(),
            DeclKind::Plain(name) => self
                .expand_plain_property_value()?
                .map(|value| (name.clone(), value)),
            DeclKind::Linked(name) => self
                .expand_linked_property_value()
                .map(|value| (name.clone(), value)),
            DeclKind::Temporal(name) => self
                .expand_temporal_property_value()?
                .map(|value| (name.clone(), value)),
            DeclKind::Hypertext(name) => {
                // h-review backward compatibility, per
                // https://microformats.org/wiki/h-review (Backward Compatibility section):
                // the legacy `e-description` property is reported under the current
                // property name `content`, so legacy and current markup produce the same
                // output key. Outside an h-review parent the name is left unchanged.
                let is_legacy_description = self.is_in_h_review_context() && name == "description";
                let output_name = if is_legacy_description {
                    String::from("content")
                } else {
                    name.clone()
                };
                Some((output_name, self.get_embedded_html()?))
            }
        };
        Ok(expanded)
    }
}

#[cfg(test)]
mod test;