microformats 0.17.0

A union library of the Microformats types and associated parser.
Documentation
use std::str::FromStr;

use crate::parse::{element::{Extraction, HtmlUrlExpander}, remove_surrounding_whitespace};

use super::*;
use html_escape::decode_html_entities;
/// Expands a single property declaration on an element into a named property value.
///
/// The parser is bound to one element, the kind of property declared on it and the
/// base URL handed to the extraction helpers.
pub(crate) struct PropertyParser {
    /// Shared pointer to the element carrying the property declaration.
    pub(crate) elem: ElementPtr,
    /// The declared property kind; selects which expansion `expand` runs.
    kind: DeclKind,
    /// URL passed to the extraction helpers and used to join linked values against.
    base_url: Url,
}

impl std::fmt::Debug for PropertyParser {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> swc_html_codegen::Result {
        f.debug_struct("PropertyParser")
            .field("forKind", &self.kind)
            .finish()
    }
}

impl PropertyParser {
    #[tracing::instrument(level = "trace", ret)]
    pub(crate) fn new(elem: ElementPtr, kind: DeclKind, base_url: Url) -> Self {
        Self {
            elem,
            kind,
            base_url,
        }
    }
    /// Resolves the value of a plain-text (`p-*`) property for this element.
    ///
    /// The candidates are tried in order and the first one that yields a value wins:
    /// the value-class-pattern, then the `title`, `value` and `alt` attributes (each
    /// only on the tags listed next to it), and finally the element's text content.
    /// A resulting `PropertyValue::Plain` has its surrounding whitespace removed;
    /// any other variant is passed through untouched.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by the value-class-pattern extraction. A failure
    /// while reading the text content is not an error; it simply yields no value.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn expand_plain_property_value(&self) -> Result<Option<PropertyValue>, crate::parse::Error> {
        let elem = &self.elem.node;

        // 1. Check for VCP first; use if found.
        let resolved = value_class::ValueClassPropertyExtractor {
            element: Arc::clone(&self.elem),
            hint: value_class::TypeHint::Plain,
        }
        .extract_value_class(&self.base_url)?
        .filter(non_empty_property_value)
        // 2. If abbr.p-x[title] or link.p-x[title], return the title attribute.
        .or_else(|| {
            elem.attr("title")
                .filter(|_| ["abbr", "link"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(PropertyValue::Plain)
        })
        // 3. else if data.p-x[value] or input.p-x[value], then return the value attribute.
        //    Both tags are accepted, matching the linked and temporal expansions.
        .or_else(|| {
            elem.attr("value")
                .filter(|_| ["data", "input"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(PropertyValue::Plain)
        })
        // 4. else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
        .or_else(|| {
            elem.attr("alt")
                .filter(|_| ["img", "area"].contains(&elem.tag()))
                .filter(non_empty_string)
                .map(PropertyValue::Plain)
        })
        // 5. else return the textContent of the element
        .or_else(|| {
            elem.text_content_with_img_links(&self.base_url)
                .ok()
                .map(|Extraction { text, .. }| PropertyValue::Plain(text))
        });

        // Only plain text is trimmed; other variants coming out of the VCP are kept as-is.
        Ok(resolved.map(|value| match value {
            PropertyValue::Plain(text) => PropertyValue::Plain(remove_surrounding_whitespace(text)),
            other => other,
        }))
    }

    /// Resolves the value of a linked (`u-*`) property for this element.
    ///
    /// `img` elements are handed straight to `extract_img_element`. For every other
    /// element a textual candidate is picked from, in order: `href`, `src`, `poster`,
    /// `data`, the value-class-pattern, `title`, `value` (each attribute only on the
    /// tags listed next to it) and finally the element's text content. `href` and `src`
    /// are taken as-is, without the non-empty check applied to the later attributes.
    ///
    /// The candidate (or an empty string when nothing was found) is joined against the
    /// base URL; if joining succeeds the result is a `PropertyValue::Url`, otherwise the
    /// raw text is returned as `PropertyValue::Plain`.
    fn expand_linked_property_value(&self) -> Option<PropertyValue> {
        let node = &self.elem.node;

        // 2. image processing. No other step can apply to an `img`, so answer right away
        //    instead of computing a fallback value that would be thrown away.
        if node.tag() == "img" {
            return extract_img_element(node, &self.base_url);
        }

        // 1. if a.u-x[href] or area.u-x[href] or link.u-x[href], then get the href attribute
        let text_value = node
            .attr("href")
            .filter(|_| ["a", "area", "link"].contains(&node.tag()))
            .or_else(|| {
                // 3. else if audio.u-x[src] or video.u-x[src] or source.u-x[src] or iframe.u-x[src], then get the src attribute
                node.attr("src")
                    .filter(|_| ["audio", "video", "source", "iframe"].contains(&node.tag()))
            })
            .or_else(|| {
                // 4. else if video.u-x[poster], then get the poster attribute
                node.attr("poster")
                    .filter(non_empty_string)
                    .filter(|_| ["video"].contains(&node.tag()))
            })
            .or_else(|| {
                // 5. else if object.u-x[data], then get the data attribute
                node.attr("data")
                    .filter(non_empty_string)
                    .filter(|_| ["object"].contains(&node.tag()))
            })
            .or_else(|| {
                // 6. else parse the element for the value-class-pattern. If a value is found, get it
                // TODO: Refactor this to properly throw an exception.
                // Extraction errors are currently swallowed and treated as "no value".
                let from_vcp = value_class::ValueClassPropertyExtractor {
                    element: Arc::clone(&self.elem),
                    hint: value_class::TypeHint::Plain,
                }
                .extract_value_class(&self.base_url)
                .ok()
                .flatten()
                .filter(non_empty_property_value);

                // Only a plain-text result can serve as a link candidate.
                match from_vcp {
                    Some(PropertyValue::Plain(value)) => Some(value),
                    _ => None,
                }
            })
            .or_else(|| {
                // 7. else if abbr.u-x[title], then get the title attribute
                node.attr("title")
                    .filter(non_empty_string)
                    .filter(|_| ["abbr"].contains(&node.tag()))
            })
            .or_else(|| {
                // 8. else if data.u-x[value] or input.u-x[value], then get the value attribute
                node.attr("value")
                    .filter(non_empty_string)
                    .filter(|_| ["input", "data"].contains(&node.tag()))
            })
            .or_else(|| {
                // 9. else get the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements
                node.text_content(&self.base_url).ok().map(Into::into)
            });

        let linked_text = text_value.unwrap_or_default();

        // Prefer an absolute URL; keep the raw text when it cannot be joined to the base.
        Some(match self.base_url.join(&linked_text) {
            Ok(url) => PropertyValue::Url(url),
            Err(_) => PropertyValue::Plain(linked_text),
        })
    }

    /// Resolves the value of a temporal (`dt-*`) property for this element.
    ///
    /// A value produced by the value-class-pattern is returned unchanged. Otherwise a
    /// string is taken from `datetime`, `title` or `value` (each only on the tags listed
    /// next to it), falling back to the element's text content. The string is parsed as
    /// a temporal value; when parsing fails it is kept as plain text. Either way the
    /// result is dropped if `non_empty_property_value` rejects it.
    ///
    /// # Errors
    ///
    /// Propagates errors from the value-class-pattern extraction and from reading the
    /// element's text content.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn expand_temporal_property_value(&self) -> Result<Option<PropertyValue>, crate::parse::Error> {
        let node = &self.elem.node;

        // 1. parse the element for the value-class-pattern, including the date and time parsing rules. If a value is found, then return it.
        let from_vcp = value_class::ValueClassPropertyExtractor {
            element: Arc::clone(&self.elem),
            hint: value_class::TypeHint::Temporal,
        }
        .extract_value_class(&self.base_url)?;

        if from_vcp.is_some() {
            return Ok(from_vcp);
        }

        // 2. if time.dt-x[datetime] or ins.dt-x[datetime] or del.dt-x[datetime], then return the datetime attribute
        let from_attribute = node
            .attr("datetime")
            .filter(|_| ["time", "ins", "del"].contains(&node.tag()))
            .filter(non_empty_string)
            // 3. else if abbr.dt-x[title], then return the title attribute
            .or_else(|| {
                node.attr("title")
                    .filter(|_| ["abbr"].contains(&node.tag()))
                    .filter(non_empty_string)
            })
            // 4. else if data.dt-x[value] or input.dt-x[value], then return the value attribute
            .or_else(|| {
                node.attr("value")
                    .filter(|_| ["data", "input"].contains(&node.tag()))
                    .filter(non_empty_string)
            });

        let raw = match from_attribute {
            Some(text) => text,
            // 5. else return the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements.
            None => node.text_content(&self.base_url).map(Into::into)?,
        };

        // Unparseable input is kept as plain text rather than discarded.
        let resolved = match microformats_types::temporal::Value::from_str(&raw) {
            Ok(stamp) => PropertyValue::Temporal(stamp),
            Err(_) => PropertyValue::Plain(raw),
        };

        Ok(Some(resolved).filter(non_empty_property_value))
    }

    /// Builds the embedded-markup (`PropertyValue::Fragment`) value for this element.
    ///
    /// The fragment carries the element's HTML content after URL expansion against the
    /// base URL, the element's text (surrounding whitespace removed, HTML entities
    /// decoded), the links collected while extracting that text, and the element's
    /// `lang` attribute if present.
    ///
    /// # Errors
    ///
    /// Propagates failures from reading the HTML content, expanding its URLs, or
    /// extracting the text content.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn get_embedded_html(&self) -> Result<PropertyValue, crate::parse::Error> {
        let node = &self.elem.node;

        let markup = node.html_content()?;
        let html = HtmlUrlExpander::expand_urls_in_html_string(&markup, &self.base_url)?;

        let Extraction { text, links } = node.text_content_with_img_links(&self.base_url)?;
        let trimmed = remove_surrounding_whitespace(text);
        let value = decode_html_entities(&trimmed).into_owned();
        let lang = node.attr("lang");

        Ok(PropertyValue::Fragment(Fragment {
            html,
            value,
            links,
            lang,
        }))
    }

    /// Expands the property declared on this element into a `(name, value)` pair.
    ///
    /// The declaration kind selects the expansion: plain, linked, temporal or embedded
    /// markup. `Ok(None)` is returned when the selected expansion yields no value.
    ///
    /// # Errors
    ///
    /// Propagates any error raised by the selected expansion.
    ///
    /// # Panics
    ///
    /// Panics if the parser was created with a `DeclKind::Root` declaration, which is
    /// treated as unreachable here.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    pub(crate) fn expand(&self) -> Result<Option<(String, PropertyValue)>, crate::parse::Error> {
        // Resolve the property name together with its (possibly missing) value first,
        // then pair them up in a single place.
        let (name, value) = match &self.kind {
            DeclKind::Root(_) => unreachable!(),
            DeclKind::Plain(name) => (name, self.expand_plain_property_value()?),
            DeclKind::Linked(name) => (name, self.expand_linked_property_value()),
            DeclKind::Temporal(name) => (name, self.expand_temporal_property_value()?),
            DeclKind::Hypertext(name) => (name, Some(self.get_embedded_html()?)),
        };

        Ok(value.map(|resolved| (name.to_owned(), resolved)))
    }
}

#[cfg(test)]
mod test;