use std::sync::Arc;
use self::element::{LinkRelationExpander, MatchedElements};
use microformats_types::{Properties, PropertyValue};
use regex::Regex;
use swc_common::{BytePos, FileName, SourceFile};
use swc_html_codegen::Emit as _;
use swc_html_parser::parser::ParserConfig;
pub trait ParserHook: Send + Sync {
fn on_property_matched(&self, node: &element::Node, name: &str, value: µformats_types::PropertyValue);
fn on_item_matched(&self, node: &element::Node, item_type: &str);
}
// Private submodules of the parser.
// `element` provides `MatchedElements`, `LinkRelationExpander`, `Node` and `Placement`.
mod element;
// `head` provides `parse_metaformats_from_head`, used when the `metaformats` feature is enabled.
mod head;
mod property;
// NOTE(review): declared without `#[cfg(test)]` at this site — confirm the module gates itself.
mod test;
mod value_class;
/// Errors raised by this module while turning HTML into microformats data.
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum Error {
/// The HTML parser reported an error; the payload is the parser's own error value.
#[error("Failed to parse HTML: {0:?}")]
Html(swc_html_parser::error::Error),
/// Serializing HTML failed; the payload is a textual description.
#[error("Failed to generate HTML: {0}")]
HtmlCodegen(String),
/// A child item had no parent item; the payload is the child's placement.
#[error("Missing the parent item for a child item at the location {0:?}")]
MissingParentItem(crate::parse::element::Placement),
/// A property could not be expanded.
#[error("Invalid property for expansion.")]
InvalidPropertyExpansion,
/// No item could be determined to receive a property found at the given placement.
#[error("Could not determine which item to add a property to the location of {0:?}")]
MissingParentItemForProperty(element::Placement),
/// No parent item could be determined for a property declaration at the given placement.
#[error(
"Could not determine which parent item to define a property to from the location of {0:?}"
)]
MissingParentItemForPropertyDeclaration(element::Placement),
/// Neither the document nor the caller supplied a base URL for resolving relative URLs.
#[error("A URL to base relative URLs in this document is required.")]
UrlBaseForDocumentRequired,
/// An error from `microformats_types`, displayed transparently.
#[error(transparent)]
Types(#[from] microformats_types::Error),
/// A formatting error, displayed transparently.
#[error(transparent)]
Fmt(#[from] std::fmt::Error),
/// A URL parsing error, displayed transparently.
#[error(transparent)]
Url(#[from] url::ParseError),
}
impl From<swc_html_parser::error::Error> for Error {
fn from(value: swc_html_parser::error::Error) -> Self {
Self::Html(value)
}
}
impl From<microformats_types::temporal::Error> for Error {
fn from(value: microformats_types::temporal::Error) -> Self {
Self::Types(microformats_types::Error::from(value))
}
}
// Regular expressions compiled once on first use.
lazy_static::lazy_static! {
// Matches a run of one or more whitespace characters.
static ref RE_WHITESPACE: Regex = Regex::new(r"(\s)+").unwrap();
// Matches a whole class name of the form `<prefix>-<name>`:
//   * `prefix` is one of `h`, `p`, `u`, `dt`, `e`;
//   * `name` is an optional leading `[a-z0-9]+-` segment followed by one or
//     more hyphen-separated runs of lowercase ASCII letters.
static ref RE_CLASS_NAME: Regex = Regex::new(r#"^(?P<prefix>((h|p|u|dt|e){1}))-(?P<name>([a-z0-9]+-)?[a-z]+(-[a-z]+)*)$"#).unwrap();
}
/// Returns `true` when `s` contains at least one byte.
///
/// The parameter stays `&String` (hence the lint allowance) so the function's
/// signature is unchanged for existing callers — presumably it is passed
/// directly as a predicate over `&String` items.
#[allow(clippy::ptr_arg)]
fn non_empty_string(s: &String) -> bool {
    let is_blank = s.as_str().is_empty();
    !is_blank
}
/// Returns `true` when the property value does not report itself as empty.
fn non_empty_property_value(p: &PropertyValue) -> bool {
    let is_blank = p.is_empty();
    !is_blank
}
/// Renders `text` to a string and strips leading and trailing whitespace.
///
/// Whitespace inside the text is left untouched.
fn remove_surrounding_whitespace(text: impl ToString) -> String {
    let rendered = text.to_string();
    // `str::trim` strips exactly the characters for which `char::is_whitespace` holds.
    rendered.trim().to_owned()
}
fn find_head_element(dom: &swc_html_ast::Document) -> Option<swc_html_ast::Element> {
for child in &dom.children {
if let swc_html_ast::Child::Element(element) = child {
if element.tag_name.to_string() == "html" {
for html_child in &element.children {
if let swc_html_ast::Child::Element(html_element) = html_child {
if html_element.tag_name.to_string() == "head" {
return Some(html_element.clone());
}
}
}
}
}
}
None
}
/// Merges every entry of `addl_map` into `base_map`.
///
/// When a property name already exists in `base_map`, the incoming values are
/// appended after the existing ones; otherwise the entry is inserted as-is.
fn merge_hash_maps(base_map: &mut Properties, addl_map: Properties) {
    for (name, incoming) in addl_map {
        match base_map.get_mut(&name) {
            Some(existing) => existing.extend(incoming),
            None => {
                base_map.insert(name, incoming);
            }
        }
    }
}
/// An element node paired with a numeric index.
///
/// NOTE(review): `index` presumably records the element's position in the
/// order elements were collected — confirm against the code that builds it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ElementRef {
/// Numeric index associated with `node`.
pub index: usize,
/// The element node itself.
pub node: element::Node,
}
/// Shared, reference-counted handle to an [`ElementRef`].
pub type ElementPtr = Arc<ElementRef>;
/// Microformats parser holding a parsed HTML document.
///
/// Created with [`Parser::from_html`] or through [`ParserBuilder`].
pub struct Parser {
/// The parsed HTML document; passed mutably to element matching in
/// `into_document` and serialized by `to_html`.
dom: swc_html_ast::Document,
/// Optional observer; passed to element matching and notified for each
/// top-level item in `into_document`.
hook: Option<Arc<dyn ParserHook>>,
/// Flag forwarded to `MatchedElements::for_document`.
enable_id_generation: bool,
}
/// Manual `Clone`: deep-copies the document and shares the hook through its `Arc`.
impl Clone for Parser {
    fn clone(&self) -> Self {
        // Exhaustive destructuring makes the compiler flag this impl if a
        // field is ever added to `Parser`.
        let Self {
            dom,
            hook,
            enable_id_generation,
        } = self;
        Self {
            dom: dom.clone(),
            hook: hook.clone(),
            enable_id_generation: *enable_id_generation,
        }
    }
}
/// Opaque `Debug` output: prints only the type name, never the document or hook.
impl std::fmt::Debug for Parser {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Parser")
    }
}
impl Parser {
#[tracing::instrument(level = "trace", err, fields(html = html.len()))]
pub fn from_html(html: String) -> Result<Self, crate::Error> {
let config = ParserConfig {
scripting_enabled: false,
iframe_srcdoc: false,
allow_self_closing: true,
};
let mut html_errors = Default::default();
let source_file = SourceFile::new(
FileName::Anon.into(),
false,
FileName::Anon.into(),
html.into(),
BytePos(1),
);
let dom = swc_html_parser::parse_file_as_document(&source_file, config, &mut html_errors)
.map_err(Error::from)?;
drop(html_errors);
Ok(Self { dom, hook: None, enable_id_generation: false })
}
pub fn with_hook(mut self, hook: Arc<dyn ParserHook>) -> Self {
self.hook = Some(hook);
self
}
pub fn with_id_generation(mut self, enable: bool) -> Self {
self.enable_id_generation = enable;
self
}
#[tracing::instrument(level = "trace", skip(self), err, fields(base_url = base_url.as_ref().map(|u|u.to_string())))]
pub fn into_document(
&mut self,
base_url: Option<url::Url>,
) -> Result<microformats_types::Document, crate::Error> {
let mut doc: microformats_types::Document = Default::default();
let matched_elements = MatchedElements::for_document(&mut self.dom, self.hook.clone(), self.enable_id_generation)?;
let base_url = matched_elements
.discern_base_url()
.or(base_url)
.ok_or(Error::UrlBaseForDocumentRequired)?;
let link_relation_expander = LinkRelationExpander {
base_url: base_url.clone(),
elements: matched_elements.link_relation_elements(),
};
link_relation_expander.expand(&mut doc)?;
for item_elem_ptr in matched_elements.top_level_elements() {
let item_elem_ptr_clone = item_elem_ptr.clone();
let item = matched_elements.expand_item_from_element(item_elem_ptr, &base_url)?;
if let Some(hook) = &self.hook {
let item_type = item.r#type.first().map(|c| c.to_string()).unwrap_or_else(|| "unknown".to_string());
hook.on_item_matched(&item_elem_ptr_clone.node, &item_type);
}
doc.items.push(item)
}
#[cfg(feature = "metaformats")]
{
if let Some(head_element) = find_head_element(&self.dom) {
if let Some(meta_item) = head::parse_metaformats_from_head(&head_element, &base_url, doc.url.as_ref()) {
doc.meta_item = Some(meta_item);
}
}
}
Ok(doc)
}
pub fn to_html(&self) -> Result<String, crate::Error> {
use swc_html_codegen::{
writer::basic::{BasicHtmlWriter, BasicHtmlWriterConfig, IndentType, LineFeed},
CodeGenerator, CodegenConfig, Emit,
};
let mut buf = std::ffi::OsString::new();
let mut writer = BasicHtmlWriter::new(
&mut buf,
None,
BasicHtmlWriterConfig {
indent_type: IndentType::Space,
indent_width: 2,
linefeed: LineFeed::LF,
},
);
let mut generator = CodeGenerator::new(&mut writer, CodegenConfig {
minify: false,
scripting_enabled: true,
context_element: None,
tag_omission: Some(true),
keep_head_and_body: Some(true),
self_closing_void_elements: Some(true),
quotes: Some(true),
});
generator.emit(&self.dom).map_err(|e| crate::Error::HtmlCodegen(e.to_string()))?;
buf.into_string().map_err(|_| crate::Error::HtmlCodegen("Invalid UTF-8 in generated HTML".to_string()))
}
pub fn builder() -> ParserBuilder {
ParserBuilder::default()
}
}
/// Builder that collects the HTML input and options for a [`Parser`].
///
/// The default value has no HTML, no hook, and ID generation disabled.
#[derive(Default)]
pub struct ParserBuilder {
/// HTML to parse; `build` fails when this was never provided.
html: Option<String>,
/// Optional observer to install on the built parser.
hook: Option<Arc<dyn ParserHook>>,
/// Value passed to `Parser::with_id_generation` by `build`.
enable_id_generation: bool,
}
impl ParserBuilder {
    /// Sets the HTML source to parse, replacing any previously supplied one.
    pub fn with_html(self, html: impl Into<String>) -> Self {
        Self {
            html: Some(html.into()),
            ..self
        }
    }

    /// Sets the observer to install on the built parser.
    pub fn with_hook(self, hook: Arc<dyn ParserHook>) -> Self {
        Self {
            hook: Some(hook),
            ..self
        }
    }

    /// Sets the ID-generation flag for the built parser.
    pub fn with_id_generation(self, enable: bool) -> Self {
        Self {
            enable_id_generation: enable,
            ..self
        }
    }

    /// Parses the configured HTML and applies the configured options.
    ///
    /// # Errors
    ///
    /// * An `InvalidInput` I/O error (converted into `crate::Error`) when no
    ///   HTML was supplied.
    /// * Any error returned by [`Parser::from_html`].
    pub fn build(self) -> Result<Parser, crate::Error> {
        let Self {
            html,
            hook,
            enable_id_generation,
        } = self;
        let source = html.ok_or_else(|| {
            std::io::Error::new(std::io::ErrorKind::InvalidInput, "HTML content not provided")
        })?;
        let configured = Parser::from_html(source)?.with_id_generation(enable_id_generation);
        Ok(match hook {
            Some(h) => configured.with_hook(h),
            None => configured,
        })
    }
}