// microformats 0.17.0
//
// A union library of the Microformats types and associated parser.
// Documentation
use std::sync::Arc;

use self::element::{LinkRelationExpander, MatchedElements};
use microformats_types::{Properties, PropertyValue};
use regex::Regex;
use swc_common::{BytePos, FileName, SourceFile};
use swc_html_codegen::Emit as _;
use swc_html_parser::parser::ParserConfig;

/// A trait for custom hooks that can be called during parsing to tag or process nodes.
pub trait ParserHook: Send + Sync {
    /// Called when a property is matched on a node.
    fn on_property_matched(&self, node: &element::Node, name: &str, value: &microformats_types::PropertyValue);

    /// Called when an item is matched on a node.
    fn on_item_matched(&self, node: &element::Node, item_type: &str);
}

mod element;
mod head;
mod property;
mod test;
mod value_class;

/// Errors that can be produced while turning HTML into Microformats data.
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum Error {
    /// The HTML parser reported a failure for the input.
    #[error("Failed to parse HTML: {0:?}")]
    Html(swc_html_parser::error::Error),

    /// Generating HTML failed; the payload is a description of the failure.
    #[error("Failed to generate HTML: {0}")]
    HtmlCodegen(String),

    /// No parent item could be found for a child item at the given placement.
    #[error("Missing the parent item for a child item at the location {0:?}")]
    MissingParentItem(element::Placement),

    /// A property was not valid for expansion.
    #[error("Invalid property for expansion.")]
    InvalidPropertyExpansion,

    /// No item could be determined to receive the property found at the given placement.
    #[error("Could not determine which item to add a property to the location of {0:?}")]
    MissingParentItemForProperty(element::Placement),

    /// No parent item could be determined for the property declared at the given placement.
    #[error(
        "Could not determine which parent item to define a property to from the location of {0:?}"
    )]
    MissingParentItemForPropertyDeclaration(element::Placement),

    /// Neither the document nor the caller provided a URL to resolve relative URLs against.
    #[error("A URL to base relative URLs in this document is required.")]
    UrlBaseForDocumentRequired,

    /// An error bubbled up from `microformats_types`.
    #[error(transparent)]
    Types(#[from] microformats_types::Error),

    /// A formatting operation failed.
    #[error(transparent)]
    Fmt(#[from] std::fmt::Error),

    /// A URL could not be parsed.
    #[error(transparent)]
    Url(#[from] url::ParseError),
}

impl From<swc_html_parser::error::Error> for Error {
    fn from(value: swc_html_parser::error::Error) -> Self {
        Self::Html(value)
    }
}

impl From<microformats_types::temporal::Error> for Error {
    fn from(value: microformats_types::temporal::Error) -> Self {
        Self::Types(microformats_types::Error::from(value))
    }
}

// Both patterns are fixed literals, so the `unwrap()` calls can only fail if a literal itself
// is not a valid regular expression.
lazy_static::lazy_static! {
    // Matches a run of one or more whitespace characters.
    static ref RE_WHITESPACE: Regex = Regex::new(r"(\s)+").unwrap();
    // Matches an entire class name of the form `<prefix>-<name>`: the `prefix` group is one of
    // `h`, `p`, `u`, `dt` or `e`; the `name` group is hyphen-separated runs of the letters
    // `a`-`z`, optionally preceded by a single leading `[a-z0-9]+-` segment.
    static ref RE_CLASS_NAME: Regex = Regex::new(r#"^(?P<prefix>((h|p|u|dt|e){1}))-(?P<name>([a-z0-9]+-)?[a-z]+(-[a-z]+)*)$"#).unwrap();
}

/// Returns `true` when `s` holds at least one character.
// The parameter stays `&String` (hence the lint allowance) — presumably so the function can be
// handed directly to adaptors iterating over `String` items; verify against callers.
#[allow(clippy::ptr_arg)]
fn non_empty_string(s: &String) -> bool {
    let text: &str = s.as_str();
    !text.is_empty()
}

/// Returns `true` unless `p` reports itself as empty.
fn non_empty_property_value(p: &PropertyValue) -> bool {
    let is_blank = p.is_empty();
    !is_blank
}

/// Renders `text` as a string and strips whitespace from both of its ends.
///
/// Whitespace inside the text is left untouched.
fn remove_surrounding_whitespace(text: impl ToString) -> String {
    let rendered = text.to_string();
    rendered.trim().to_owned()
}

/// Finds the `head` element that is a direct child of a top-level `html` element.
///
/// Only direct children are inspected at both levels. Returns a clone of the first match in
/// document order, or `None` when no such element exists.
// The only caller in this file is compiled under the `metaformats` feature, so without that
// feature the function would otherwise be reported as dead code.
#[cfg_attr(not(feature = "metaformats"), allow(dead_code))]
fn find_head_element(dom: &swc_html_ast::Document) -> Option<swc_html_ast::Element> {
    // Yields the element children of `children` whose tag name equals `tag`. Going through
    // `&*tag_name` compares as `&str` instead of allocating a `String` for every candidate.
    fn elements_named<'a>(
        children: &'a [swc_html_ast::Child],
        tag: &'a str,
    ) -> impl Iterator<Item = &'a swc_html_ast::Element> + 'a {
        children.iter().filter_map(move |child| match child {
            swc_html_ast::Child::Element(element) if &*element.tag_name == tag => Some(element),
            _ => None,
        })
    }

    elements_named(&dom.children, "html")
        .flat_map(|html| elements_named(&html.children, "head"))
        .next()
        .cloned()
}

/// Merges `addl_map` into `base_map`.
///
/// Values for a property name already present in `base_map` are appended after the existing
/// values; names that are new to `base_map` are added with the values from `addl_map`.
fn merge_hash_maps(base_map: &mut Properties, addl_map: Properties) {
    for (property_name, property_values) in addl_map {
        // The entry API finds or creates the slot with one lookup instead of two.
        base_map
            .entry(property_name)
            .or_default()
            .extend(property_values);
    }
}

/// Pairs an [`element::Node`] with an index.
///
/// NOTE(review): `index` presumably identifies the node's position among the elements the
/// parser tracks — confirm against the `element` module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ElementRef {
    /// Numeric index associated with `node`.
    pub index: usize,
    /// The node this reference describes.
    pub node: element::Node,
}

/// Shared, reference-counted handle to an [`ElementRef`].
pub type ElementPtr = Arc<ElementRef>;

/// A parsed HTML document together with the options used when extracting Microformats from it.
///
/// Cloning copies the document and the flag, and shares the hook: the `Arc` is cloned, not the
/// hook behind it.
#[derive(Clone)]
pub struct Parser {
    /// The HTML document this parser operates on.
    dom: swc_html_ast::Document,
    /// Optional callbacks to notify about matches.
    hook: Option<Arc<dyn ParserHook>>,
    /// Whether ID generation for AST elements is requested.
    enable_id_generation: bool,
}

/// Formats every parser as the fixed text `Parser`; no field is included in the output.
impl std::fmt::Debug for Parser {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(formatter, "Parser")
    }
}

impl Parser {
    /// Parses the provided HTML into a DOM document prepared for Microformats parsing.
    ///
    /// # Errors
    ///
    /// This function will return an error if the HTML could not be parsed.
    #[tracing::instrument(level = "trace", err, fields(html = html.len()))]
    pub fn from_html(html: String) -> Result<Self, crate::Error> {
        let config = ParserConfig {
            scripting_enabled: false,
            iframe_srcdoc: false,
            allow_self_closing: true,
        };
        let mut html_errors = Default::default();
        let source_file = SourceFile::new(
            FileName::Anon.into(),
            false,
            FileName::Anon.into(),
            html.into(),
            BytePos(1),
        );
        let dom = swc_html_parser::parse_file_as_document(&source_file, config, &mut html_errors)
            .map_err(Error::from)?;

        drop(html_errors); // TODO: Report this back to the caller.

        Ok(Self { dom, hook: None, enable_id_generation: false })
    }

    /// Sets a custom hook for the parser.
    pub fn with_hook(mut self, hook: Arc<dyn ParserHook>) -> Self {
        self.hook = Some(hook);
        self
    }

    /// Enables or disables ID generation for AST elements.
    pub fn with_id_generation(mut self, enable: bool) -> Self {
        self.enable_id_generation = enable;
        self
    }

    /// With the loaded DOM in memory, parses it into a [structured document][microformats_types::Document].
    ///
    /// # Errors
    ///
    /// This function will return an error if the DOM could not be parsed, relations could not be
    /// expanded or if items could not be expanded.
    #[tracing::instrument(level = "trace", skip(self), err, fields(base_url = base_url.as_ref().map(|u|u.to_string())))]
    pub fn into_document(
        &mut self,
        base_url: Option<url::Url>,
    ) -> Result<microformats_types::Document, crate::Error> {
        let mut doc: microformats_types::Document = Default::default();
        let matched_elements = MatchedElements::for_document(&mut self.dom, self.hook.clone(), self.enable_id_generation)?;

        let base_url = matched_elements
            .discern_base_url()
            .or(base_url)
            .ok_or(Error::UrlBaseForDocumentRequired)?;

        let link_relation_expander = LinkRelationExpander {
            base_url: base_url.clone(),
            elements: matched_elements.link_relation_elements(),
        };

        link_relation_expander.expand(&mut doc)?;

        for item_elem_ptr in matched_elements.top_level_elements() {
            let item_elem_ptr_clone = item_elem_ptr.clone();
            let item = matched_elements.expand_item_from_element(item_elem_ptr, &base_url)?;
            if let Some(hook) = &self.hook {
                let item_type = item.r#type.first().map(|c| c.to_string()).unwrap_or_else(|| "unknown".to_string());
                hook.on_item_matched(&item_elem_ptr_clone.node, &item_type);
            }
            doc.items.push(item)
        }

        // Parse metaformats from head element
        #[cfg(feature = "metaformats")]
        {
            if let Some(head_element) = find_head_element(&self.dom) {
                if let Some(meta_item) = head::parse_metaformats_from_head(&head_element, &base_url, doc.url.as_ref()) {
                    doc.meta_item = Some(meta_item);
                }
            }
        }

        Ok(doc)
    }

    /// Generates HTML from the current AST, including any added attributes like data-mf2-id.
    ///
    /// # Errors
    ///
    /// This function will return an error if HTML generation fails.
    pub fn to_html(&self) -> Result<String, crate::Error> {
use swc_html_codegen::{
    writer::basic::{BasicHtmlWriter, BasicHtmlWriterConfig, IndentType, LineFeed},
    CodeGenerator, CodegenConfig, Emit,
};

        let mut buf = std::ffi::OsString::new();
        let mut writer = BasicHtmlWriter::new(
            &mut buf,
            None,
            BasicHtmlWriterConfig {
                indent_type: IndentType::Space,
                indent_width: 2,
                linefeed: LineFeed::LF,
            },
        );
        let mut generator = CodeGenerator::new(&mut writer, CodegenConfig {
            minify: false,
            scripting_enabled: true,
            context_element: None,
            tag_omission: Some(true),
            keep_head_and_body: Some(true),
            self_closing_void_elements: Some(true),
            quotes: Some(true),
        });
        generator.emit(&self.dom).map_err(|e| crate::Error::HtmlCodegen(e.to_string()))?;
        buf.into_string().map_err(|_| crate::Error::HtmlCodegen("Invalid UTF-8 in generated HTML".to_string()))
    }

    /// Creates a builder for constructing a Parser with custom options.
    pub fn builder() -> ParserBuilder {
        ParserBuilder::default()
    }
}

/// Builder for constructing a Parser with custom options.
///
/// The default builder has no HTML, no hook, and ID generation disabled.
#[derive(Default)]
pub struct ParserBuilder {
    /// HTML to parse; `build` fails when this was never set.
    html: Option<String>,
    /// Optional hook to install on the built parser.
    hook: Option<Arc<dyn ParserHook>>,
    /// Whether the built parser should have ID generation enabled.
    enable_id_generation: bool,
}

impl ParserBuilder {
    /// Stores the HTML content that `build` will parse, replacing any earlier value.
    pub fn with_html(self, html: impl Into<String>) -> Self {
        Self {
            html: Some(html.into()),
            ..self
        }
    }

    /// Stores the hook to install on the built parser, replacing any earlier hook.
    pub fn with_hook(self, hook: Arc<dyn ParserHook>) -> Self {
        Self {
            hook: Some(hook),
            ..self
        }
    }

    /// Chooses whether the built parser generates IDs for AST elements.
    pub fn with_id_generation(self, enable: bool) -> Self {
        Self {
            enable_id_generation: enable,
            ..self
        }
    }

    /// Parses the stored HTML and applies the configured options to the resulting parser.
    ///
    /// # Errors
    ///
    /// Returns an error if no HTML was provided (reported as an `InvalidInput` I/O error) or
    /// if parsing the HTML fails.
    pub fn build(self) -> Result<Parser, crate::Error> {
        let Self {
            html,
            hook,
            enable_id_generation,
        } = self;
        let html = html.ok_or_else(|| {
            std::io::Error::new(std::io::ErrorKind::InvalidInput, "HTML content not provided")
        })?;

        let parser = Parser::from_html(html)?.with_id_generation(enable_id_generation);
        Ok(match hook {
            Some(hook) => parser.with_hook(hook),
            None => parser,
        })
    }
}