use super::{
merge_hash_maps, non_empty_string,
property::{adjust_timestamps, explicit, implied, item, DeclKind},
remove_surrounding_whitespace, ElementPtr, ElementRef, Error,
};
use microformats_types::{Class, Document, Item, Properties};
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use swc_html_ast::{Attribute, Child, Element, Text};
use swc_html_codegen::{
writer::basic::{BasicHtmlWriter, BasicHtmlWriterConfig, IndentType, LineFeed},
CodeGenerator, CodegenConfig, Emit,
};
#[cfg(feature = "debug_flow")]
use microformats_types::ElementSource;
/// Owned wrapper around a parsed HTML element.
///
/// The methods on this type expose attribute lookup, class parsing and
/// text/HTML extraction for the wrapped element.
#[derive(Clone, PartialEq, Eq)]
pub struct Node {
/// The wrapped element, held by value.
pub elem: Element,
}
/// Compact debug output: only the parsed classes, the `id` and the tag name
/// are printed instead of the whole element tree.
impl std::fmt::Debug for Node {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Node");
        out.field("classes", &self.mf_classes());
        out.field("id", &self.id());
        out.field("tag", &self.tag());
        out.finish()
    }
}
impl Node {
fn class_str(&self) -> String {
self.attr("class").unwrap_or_default()
}
fn id(&self) -> Option<String> {
self.attr("id").filter(super::non_empty_string)
}
pub(crate) fn mf_classes(&self) -> Vec<DeclKind> {
DeclKind::from_str(self.class_str())
}
pub(crate) fn property_classes(&self) -> Vec<DeclKind> {
self.mf_classes()
.into_iter()
.filter(|c| !c.is_root())
.collect::<Vec<_>>()
}
pub(crate) fn root_classes(&self) -> Vec<Class> {
DeclKind::extract_root_classes(&self.mf_classes())
}
pub(crate) fn attr(&self, name: &str) -> Option<String> {
self.elem
.attributes
.iter()
.find(|attr| attr.name == name)
.and_then(|attr| attr.value.as_ref().map(|v| v.to_string()))
}
pub(crate) fn tag(&self) -> &str {
&self.elem.tag_name
}
pub fn full_attribute_map(&self) -> BTreeMap<String, String> {
self.elem
.attributes
.iter()
.filter_map(|attr| {
attr.value
.as_ref()
.map(|v| (attr.name.to_string(), v.to_string()))
})
.collect()
}
#[tracing::instrument(level = "trace", skip(self), ret, fields(base_url = base_url.as_str()))]
pub(crate) fn text_content(
&self,
base_url: &url::Url,
) -> Result<Extraction, crate::parse::Error> {
InnerTextExtractor::new(self.elem.to_owned(), false, base_url.to_owned()).extract()
}
#[tracing::instrument(level = "trace", skip(self), ret, fields(base_url = base_url.as_str()))]
pub(crate) fn text_content_with_img_links(
&self,
base_url: &url::Url,
) -> Result<Extraction, crate::parse::Error> {
InnerTextExtractor::new(self.elem.to_owned(), true, base_url.to_owned()).extract()
}
#[tracing::instrument(level = "trace", skip(self), ret)]
pub(crate) fn html_content(&self) -> Result<String, crate::parse::Error> {
InnerHtmlExtractor::new(self.elem.to_owned()).extract()
}
pub fn elements(&self) -> Vec<Node> {
self.elem
.children
.iter()
.filter_map(|child| {
if let Child::Element(elem) = child {
Some(Self {
elem: elem.to_owned(),
})
} else {
None
}
})
.collect::<Vec<_>>()
}
#[cfg(feature = "debug_flow")]
pub fn capture_debug_info(&self) -> ElementSource {
ElementSource {
mf2_id: self.attr("data-mf2-id").filter(|s| !s.is_empty()),
tag: self.tag().to_string(),
classes: self
.class_str()
.split_ascii_whitespace()
.map(|s| s.to_string())
.collect(),
attributes: self.full_attribute_map(),
position: self.source_position(),
parent_ids: vec![], }
}
#[cfg(feature = "debug_flow")]
pub fn source_position(&self) -> microformats_types::SourcePosition {
use microformats_types::SourcePosition;
let offset = self.elem.span.lo.0 as usize;
SourcePosition {
line: 1, column: offset + 1, offset,
}
}
}
/// Walks an element tree and accumulates its text fragments and anchor nodes.
struct InnerTextExtractor {
/// Text fragments collected so far, in visit order.
lines: Vec<String>,
/// Element whose content is extracted.
root: Element,
/// When true, an `img` without `alt` contributes its `src` resolved against `base_url`.
extract_src_of_img: bool,
/// Base URL used to resolve image sources and link targets.
base_url: url::Url,
/// `a` elements carrying an `href` that were seen during the walk.
link_nodes: Vec<Node>,
}
impl InnerTextExtractor {
    /// Builds an extractor for `root` with no text or links collected yet.
    fn new(root: Element, extract_src_of_img: bool, base_url: url::Url) -> Self {
        Self {
            root,
            extract_src_of_img,
            base_url,
            lines: Vec::new(),
            link_nodes: Vec::new(),
        }
    }

    /// Walks the root element and returns the concatenated text together
    /// with the targets of the anchors found.
    ///
    /// Fragment-only targets (starting with `#`) are returned verbatim; all
    /// others are resolved against the base URL.
    ///
    /// # Errors
    /// Fails when emitting the tree fails or when a link target cannot be
    /// joined with the base URL.
    #[tracing::instrument(level = "trace", skip(self))]
    fn extract(mut self) -> Result<Extraction, crate::parse::Error> {
        let root = self.root.clone();
        self.emit(&root)?;

        let Self {
            lines,
            link_nodes,
            base_url,
            ..
        } = self;

        let text: String = lines.into_iter().filter(non_empty_string).collect();

        let links = link_nodes
            .iter()
            .filter_map(|anchor| anchor.attr("href"))
            .map(|href| {
                if href.starts_with('#') {
                    Ok(href)
                } else {
                    base_url.join(&href).map(|resolved| resolved.to_string())
                }
            })
            .collect::<Result<Vec<_>, _>>()?;

        Ok(Extraction { text, links })
    }
}
/// Result of a text extraction: the collected text plus the link targets found.
#[derive(Debug, PartialEq, Eq, Default)]
pub(crate) struct Extraction {
/// Concatenated text fragments.
pub text: String,
/// Targets of the anchors encountered while extracting.
pub links: Vec<String>,
}
impl From<Extraction> for String {
fn from(extract: Extraction) -> Self {
extract.text
}
}
/// Dispatches on the child kind: text and elements are forwarded to their
/// dedicated emitters, everything else (comments included) is ignored.
impl swc_html_codegen::Emit<Child> for InnerTextExtractor {
    #[tracing::instrument(level = "trace", skip(self, child), ret, name = "emit_child")]
    fn emit(&mut self, child: &Child) -> swc_html_codegen::Result {
        match child {
            Child::Element(elem) => self.emit(elem),
            Child::Text(text) => self.emit(text),
            _ => Ok(()),
        }
    }
}
/// Collects text for one element.
///
/// * `script`, `style` and `template` contribute nothing.
/// * `a` elements with an `href` are remembered as link nodes.
/// * `img` contributes its `alt`; without `alt`, and only when image source
///   extraction is enabled, its `src` resolved against the base URL and
///   padded with one space on each side.
/// * In every other case the children are visited in order.
impl swc_html_codegen::Emit<Element> for InnerTextExtractor {
    #[tracing::instrument(level = "trace", skip(self, element), ret, name = "emit_element")]
    fn emit(&mut self, element: &Element) -> swc_html_codegen::Result {
        let node = Node {
            elem: element.to_owned(),
        };
        let tag = node.tag();

        if matches!(tag, "script" | "style" | "template") {
            return Ok(());
        }

        if tag == "a" && node.attr("href").is_some() {
            self.link_nodes.push(node.clone());
        }

        if tag == "img" {
            if let Some(alt_text) = node.attr("alt") {
                self.lines.push(alt_text);
                return Ok(());
            }
            if self.extract_src_of_img {
                if let Some(src_url) = node.attr("src") {
                    let full_url = self.base_url.join(&src_url).map_err(|_| std::fmt::Error)?;
                    self.lines.push(format!(" {full_url} "));
                    return Ok(());
                }
            }
        }

        for child in &element.children {
            self.emit(child)?;
        }
        Ok(())
    }
}
/// Appends the data of a text node to the collected fragments, unmodified.
impl swc_html_codegen::Emit<Text> for InnerTextExtractor {
    #[tracing::instrument(level = "trace", skip(self, text_node), ret, name = "emit_text")]
    fn emit(&mut self, text_node: &Text) -> swc_html_codegen::Result {
        let fragment = String::from(text_node.data.as_str());
        self.lines.push(fragment);
        Ok(())
    }
}
/// Serializes the children of an element back into an HTML string.
struct InnerHtmlExtractor {
/// Element whose children are serialized.
root: Element,
}
impl InnerHtmlExtractor {
    /// Wraps `root` for serialization.
    fn new(root: Element) -> Self {
        Self { root }
    }

    /// Emits every child of the root through the HTML code generator
    /// (non-minified, quoted attributes) and strips the surrounding
    /// whitespace from the result.
    ///
    /// # Errors
    /// Fails when the code generator reports an error for any child.
    fn extract(self) -> Result<String, crate::parse::Error> {
        let writer_config = BasicHtmlWriterConfig {
            indent_type: IndentType::Tab,
            indent_width: 2,
            linefeed: LineFeed::LF,
        };
        let codegen_config = CodegenConfig {
            minify: false,
            scripting_enabled: true,
            quotes: Some(true),
            ..Default::default()
        };

        let mut html = String::default();
        let mut generator = CodeGenerator::new(
            BasicHtmlWriter::new(&mut html, None, writer_config),
            codegen_config,
        );
        for child in &self.root.children {
            generator.emit(child)?;
        }

        Ok(remove_surrounding_whitespace(html))
    }
}
/// Stateless namespace for helpers that expand relative `href`/`src` URLs inside HTML.
pub(crate) struct HtmlUrlExpander;
impl HtmlUrlExpander {
    /// Serializes the children of `element` to HTML and expands the relative
    /// `href`/`src` attribute values in it against `base_url`.
    ///
    /// # Errors
    /// Fails when the code generator reports an error for any child.
    #[allow(dead_code)]
    pub(crate) fn expand_urls_in_html(
        element: &Element,
        base_url: &url::Url,
    ) -> Result<String, crate::parse::Error> {
        let writer_config = BasicHtmlWriterConfig {
            indent_type: IndentType::Tab,
            indent_width: 2,
            linefeed: LineFeed::LF,
        };
        let codegen_config = CodegenConfig {
            minify: false,
            scripting_enabled: true,
            quotes: Some(true),
            ..Default::default()
        };

        let mut html = String::default();
        let mut generator = CodeGenerator::new(
            BasicHtmlWriter::new(&mut html, None, writer_config),
            codegen_config,
        );
        for child in &element.children {
            generator.emit(child)?;
        }

        let html = remove_surrounding_whitespace(html);
        Self::expand_urls_in_html_string(&html, base_url)
    }

    /// Rewrites double-quoted `href="…"` and `src="…"` values in `html` so
    /// that relative URLs become absolute, then drops the ` /` of
    /// self-closed void elements (`<br />` becomes `<br>`).
    ///
    /// Values starting with `http://`, `https://` or `#`, and values that
    /// cannot be joined with `base_url`, are left exactly as written.
    /// Rewritten attributes are always emitted as lowercase `href`/`src`.
    ///
    /// The rewrite is regex based, so it also touches matching text outside
    /// of real attributes.
    pub(crate) fn expand_urls_in_html_string(
        html: &str,
        base_url: &url::Url,
    ) -> Result<String, crate::parse::Error> {
        use regex::{Captures, Regex};

        /// Void elements whose self-closing form is normalised.
        const VOID_ELEMENTS: [&str; 14] = [
            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
            "source", "track", "wbr",
        ];

        /// Produces the replacement for one matched attribute: the expanded
        /// form when the value needs and allows expansion, otherwise the
        /// original match.
        fn expand_attr(attr_name: &str, caps: &Captures<'_>, base_url: &url::Url) -> String {
            let url_str = &caps[1];
            let already_resolved = url_str.starts_with("http://")
                || url_str.starts_with("https://")
                || url_str.starts_with('#');
            if !already_resolved {
                if let Ok(expanded) = base_url.join(url_str) {
                    return format!(r#"{attr_name}="{expanded}""#);
                }
            }
            caps[0].to_string()
        }

        let href_regex = Regex::new(r#"(?i)\bhref\s*=\s*"([^"]*)""#)
            .expect("href attribute pattern is a valid regex");
        let src_regex = Regex::new(r#"(?i)\bsrc\s*=\s*"([^"]*)""#)
            .expect("src attribute pattern is a valid regex");
        // One alternation instead of one regex per void element: a tag can
        // only start with one of these names, so the result is the same while
        // a single pattern is compiled per call.
        let void_regex = Regex::new(&format!(
            r#"<(?:{})[^>]*\s*/>"#,
            VOID_ELEMENTS.join("|")
        ))
        .expect("void element pattern is a valid regex");

        let result =
            href_regex.replace_all(html, |caps: &Captures<'_>| expand_attr("href", caps, base_url));
        let result = src_regex
            .replace_all(&result, |caps: &Captures<'_>| expand_attr("src", caps, base_url));
        let result =
            void_regex.replace_all(&result, |caps: &Captures<'_>| caps[0].replace(" />", ">"));

        Ok(result.into_owned())
    }
}
/// Where an element sits relative to the items discovered while walking a document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Placement {
/// Starting location of a walk; no enclosing item.
Root,
/// An item element found directly under `Root` (see `child_item`).
TopLevel {
node: ElementPtr,
},
/// An item element (`child`) nested inside another item (`parent`).
ChildOf {
parent: ElementPtr,
child: ElementPtr,
},
/// An element that is both an item and a property of the enclosing item `parent`.
PropertyItemOf {
property_elem: ElementPtr,
parent: ElementPtr,
},
/// A single property declaration `kind` made by element `owner` for the item `parent`.
PropertyOf {
parent: ElementPtr,
kind: DeclKind,
owner: ElementPtr,
},
/// A `base` element without microformats classes.
BaseURL {
node: Node,
},
/// A `meta` element without microformats classes.
Meta {
node: Node,
},
/// An `a` element with a non-empty `rel` and an `href`, or a `link` element with an `href`.
Link {
node: Node,
},
/// The `lang` attribute value found on the `html` element.
DocumentLanguage {
language: String,
},
}
impl Placement {
fn property_parent_item(&self, item: ElementPtr) -> Result<Self, Error> {
match self {
Self::PropertyItemOf {
property_elem: node,
..
}
| Self::ChildOf { child: node, .. }
| Self::TopLevel { node } => Ok(Self::PropertyItemOf {
property_elem: item,
parent: Arc::clone(node),
}),
Self::Root => self.child_item(item),
_ => Err(Error::MissingParentItemForProperty(self.to_owned())),
}
}
#[tracing::instrument]
fn child_item(&self, item: ElementPtr) -> Result<Self, Error> {
match self {
Self::PropertyItemOf {
property_elem: node,
..
}
| Self::ChildOf { child: node, .. }
| Self::TopLevel { node } => Ok(Self::ChildOf {
child: item,
parent: Arc::clone(node),
}),
Self::Root => Ok(Self::TopLevel { node: item }),
_ => Err(Error::MissingParentItem(self.to_owned())),
}
}
#[tracing::instrument]
fn property_declarations(
&self,
item: ElementPtr,
property_classes: &[DeclKind],
) -> Result<Vec<Self>, Error> {
let parent = if let Ok(Self::PropertyItemOf { ref parent, .. }) =
self.property_parent_item(Arc::clone(&item))
{
Arc::clone(parent)
} else {
return Err(Error::MissingParentItemForPropertyDeclaration(
self.to_owned(),
));
};
property_classes
.iter()
.try_fold(Vec::default(), |mut placements, prop| {
placements.push(Self::PropertyOf {
parent: Arc::clone(&parent),
kind: prop.to_owned(),
owner: Arc::clone(&item),
});
Ok(placements)
})
}
}
/// Accumulates the placements and elements discovered while walking a document.
#[derive(Default)]
pub(crate) struct MatchedElements {
/// Placements recorded so far; `remember_place` skips duplicates.
pub places: Vec<Placement>,
/// Every element registered through `remember_element`, in registration order.
pub elements: Vec<ElementPtr>,
/// Optional hook notified through `on_property_matched` when property values are produced.
pub hook: Option<Arc<dyn super::ParserHook>>,
/// When true, `translate_location` appends a `data-mf2-id` attribute to each element that carries microformats classes.
pub enable_id_generation: bool,
/// Source of the numeric suffix of generated `data-mf2-id` values.
pub id_counter: AtomicUsize,
}
/// Debug output shows only the recorded placements.
impl std::fmt::Debug for MatchedElements {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_tuple("MatchedElements");
        out.field(&self.places);
        out.finish()
    }
}
impl MatchedElements {
/// Records `place` unless an equal placement is already stored.
#[tracing::instrument(level = "trace", skip(self), fields(count = %self.places.len()))]
fn remember_place(&mut self, place: Placement) {
    let already_known = self.places.contains(&place);
    if already_known {
        return;
    }
    self.places.push(place);
}
/// Registers `node` and returns a shared pointer to it; its index is the
/// number of elements registered before it.
#[tracing::instrument(level = "trace", skip(self, node), fields(count = %self.elements.len()))]
fn remember_element(&mut self, node: Node) -> ElementPtr {
    let index = self.elements.len();
    let registered = ElementPtr::new(ElementRef { node, index });
    self.elements.push(Arc::clone(&registered));
    registered
}
fn explicit_properties(
&self,
item: ElementPtr,
base_url: &url::Url,
) -> Result<Properties, crate::parse::Error> {
self.places
.iter()
.filter_map(|loc| {
if let Placement::PropertyOf {
parent,
kind,
owner,
} = loc
{
if parent.node.elem == item.node.elem {
Some(explicit::PropertyParser::new(
Arc::clone(owner),
kind.to_owned(),
base_url.to_owned(),
Some(Arc::clone(parent)),
))
} else {
None
}
} else {
None
}
})
.try_fold(Properties::default(), |mut properties, property_parser| {
if let Some((property_name, property_value)) = property_parser.expand()? {
if let Some(values) = properties.get_mut(&property_name) {
values.push(property_value.clone());
} else {
properties.insert(property_name.clone(), vec![property_value.clone()]);
}
if let Some(ref hook) = self.hook {
hook.on_property_matched(
&property_parser.elem.node,
&property_name,
&property_value,
);
}
}
Result::<_, crate::parse::Error>::Ok(properties)
})
}
fn item_properties(
&self,
item: ElementPtr,
base_url: &url::Url,
) -> Result<Properties, crate::parse::Error> {
self.places
.iter()
.filter_map(|loc| {
if let Placement::PropertyItemOf {
parent,
property_elem,
} = loc
{
if parent.node.elem == item.node.elem {
Some(item::PropertyParser::new(
Arc::clone(property_elem),
base_url,
))
} else {
None
}
} else {
None
}
})
.try_fold(Properties::default(), |mut properties, property_parser| {
let elem_clone = property_parser.property_elem.clone();
for (property_name, property_values) in property_parser.expand(self)? {
if let Some(values) = properties.get_mut(&property_name) {
values.extend(property_values.clone());
} else {
properties.insert(property_name.clone(), property_values.clone());
}
for value in &property_values {
if let Some(ref hook) = self.hook {
hook.on_property_matched(&elem_clone.node, &property_name, value);
}
}
}
Result::<_, crate::parse::Error>::Ok(properties)
})
}
/// All properties of `item`: the explicit ones merged with those coming
/// from nested property items, followed by timestamp adjustment.
///
/// # Errors
/// Propagates errors from either property source.
#[tracing::instrument(level = "trace", skip(self, item), err, fields(base_url = base_url.to_string()))]
fn properties_for(
    &self,
    item: ElementPtr,
    base_url: &url::Url,
) -> Result<Properties, crate::parse::Error> {
    let mut merged = self.explicit_properties(Arc::clone(&item), base_url)?;
    let from_items = self.item_properties(Arc::clone(&item), base_url)?;
    merge_hash_maps(&mut merged, from_items);
    adjust_timestamps(&mut merged);
    Ok(merged)
}
#[tracing::instrument(level = "trace", skip(self, item), ret, err)]
fn children_for(
&self,
item: ElementPtr,
base_url: &url::Url,
) -> Result<Vec<Item>, crate::parse::Error> {
let elements = self
.places
.iter()
.filter_map(|loc| {
if let Placement::ChildOf { parent, child } = loc {
if *parent == item {
Some(child)
} else {
None
}
} else {
None
}
})
.collect::<Vec<_>>();
let expected_count = elements.len();
let resulting_items =
elements
.into_iter()
.try_fold(Vec::default(), |mut items, item_elem| {
let child_item =
self.expand_item_from_element(Arc::clone(item_elem), base_url)?;
items.push(child_item);
Result::<_, crate::parse::Error>::Ok(items)
})?;
assert_eq!(expected_count, resulting_items.len());
Ok(resulting_items)
}
/// Elements recorded as `TopLevel` placements, in placement order.
#[tracing::instrument(level = "trace", skip(self), ret)]
pub(crate) fn top_level_elements(&self) -> Vec<ElementPtr> {
    let mut top_level = Vec::new();
    for place in &self.places {
        if let Placement::TopLevel { node } = place {
            top_level.push(Arc::clone(node));
        }
    }
    top_level
}
/// Clones of the nodes recorded as `Link` placements, in placement order.
pub fn link_relation_elements(&self) -> Vec<Node> {
    let mut links = Vec::new();
    for place in &self.places {
        if let Placement::Link { node } = place {
            links.push(node.clone());
        }
    }
    links
}
/// First recorded `BaseURL` placement whose non-empty `href` parses as a URL.
///
/// An empty path on the parsed URL is replaced by `/`. Placements whose
/// `href` is missing, empty or unparseable are skipped.
pub(crate) fn discern_base_url(&self) -> Option<url::Url> {
    self.places.iter().find_map(|place| {
        let node = match place {
            Placement::BaseURL { node } => node,
            _ => return None,
        };
        let href = node.attr("href").filter(non_empty_string)?;
        let mut base_url = href.parse::<url::Url>().ok()?;
        if base_url.path().is_empty() {
            base_url.set_path("/");
        }
        Some(base_url)
    })
}
#[tracing::instrument(level = "trace", skip(self, item_elem), ret, err, fields(base_url = base_url.to_string()))]
pub(crate) fn expand_item_from_element(
&self,
item_elem: ElementPtr,
base_url: &url::Url,
) -> Result<Item, crate::parse::Error> {
let document_language = if let Some(Placement::DocumentLanguage { language }) = self
.places
.iter()
.find(|pos| matches!(pos, Placement::DocumentLanguage { .. }))
{
Some(language.to_owned())
} else {
None
};
let mut item = Item::new(item_elem.node.root_classes());
item.id = item_elem.node.id();
item.lang = item_elem
.node
.attr("lang")
.filter(non_empty_string)
.or(document_language);
item.children
.extend(self.children_for(Arc::clone(&item_elem), base_url)?);
let mut properties = self.properties_for(Arc::clone(&item_elem), base_url)?;
if item.children.is_empty() {
let props = properties.to_owned();
merge_hash_maps(
&mut properties,
implied::PropertiesParser::new(Arc::clone(&item_elem), props, base_url)
.extract_implied()?,
)
}
for (name, values) in &properties {
for value in values {
if let Some(ref hook) = self.hook {
hook.on_property_matched(&item_elem.node, name, value);
}
}
}
item.properties.extend(properties);
Ok(item)
}
/// Classifies `element` relative to `parent_location`, records the
/// placements it gives rise to, and returns the placement its children
/// should be walked with.
///
/// When id generation is enabled, a `data-mf2-id` attribute is appended to
/// every element that carries microformats classes.
///
/// # Errors
/// Propagates the error of `child_item` / `property_parent_item` when the
/// parent placement cannot host an item.
fn translate_location(
&mut self,
parent_location: &Placement,
element: &mut Element,
) -> Result<Placement, Error> {
// Work on a snapshot of the element as it is on entry.
let node = Node {
elem: element.clone(),
};
// Anchors with a non-empty `rel` and an `href` are link relations,
// whether or not they also carry microformats classes.
if node.tag() == "a"
&& node.attr("rel").filter(non_empty_string).is_some()
&& node.attr("href").is_some()
{
self.remember_place(Placement::Link { node: node.clone() });
}
// A non-empty `lang` on the `html` element is the document language.
if let Some(language) = node
.attr("lang")
.filter(non_empty_string)
.filter(|_| node.tag() == "html")
{
self.remember_place(Placement::DocumentLanguage { language })
}
// Elements without microformats classes never change the location;
// `base`, `meta` and `link` (with `href`) are recorded on the way.
if node.mf_classes().is_empty() {
if node.tag() == "base" {
self.remember_place(Placement::BaseURL { node });
} else if node.tag() == "meta" {
self.remember_place(Placement::Meta { node });
} else if node.tag() == "link" && node.attr("href").is_some() {
self.remember_place(Placement::Link { node });
}
return Ok(parent_location.to_owned());
}
let is_item_elem = !node.root_classes().is_empty();
let explicit_property_classes = node.property_classes();
let is_property = !explicit_property_classes.is_empty();
// NOTE(review): `node` was cloned before the `data-mf2-id` attribute is
// pushed below, so the registered element never carries its own generated
// id — confirm this is intended.
let elem_ref = self.remember_element(node);
if self.enable_id_generation {
let id = self.id_counter.fetch_add(1, Ordering::Relaxed);
element.attributes.push(Attribute {
span: Default::default(),
namespace: None,
prefix: None,
name: "data-mf2-id".into(),
raw_name: None,
value: Some(format!("mf2-{}", id).into()),
raw_value: None,
});
}
// Item only          -> child item of the parent location.
// Item and property  -> property item of the parent location.
// Property only      -> one `PropertyOf` per declared class; the location
//                       stays the parent's.
// Property whose declarations cannot be resolved -> no location.
let child_location = if is_item_elem && !is_property {
Some(parent_location.child_item(Arc::clone(&elem_ref))?)
} else if is_property {
if is_item_elem {
Some(parent_location.property_parent_item(Arc::clone(&elem_ref))?)
} else if let Ok(property_decls) = parent_location
.property_declarations(Arc::clone(&elem_ref), &explicit_property_classes)
{
for property_decl in property_decls {
self.remember_place(property_decl);
}
Some(parent_location.to_owned())
} else {
None
}
} else {
None
};
// Without a resolved location the children are walked from `Root`.
if let Some(child_location) = child_location {
self.remember_place(child_location.to_owned());
Ok(child_location)
} else {
Ok(Placement::Root)
}
}
/// Depth-first walk over `children`: every element is classified against
/// `location`, then its own children are walked with the placement that
/// classification returned. Non-element children are skipped.
///
/// # Errors
/// Stops at, and returns, the first classification error.
#[tracing::instrument(level = "trace", ret, err, skip(self, children))]
pub(crate) fn walk_over_children(
    &mut self,
    children: &mut [Child],
    location: &Placement,
) -> Result<(), crate::parse::Error> {
    for child in children.iter_mut() {
        let element = match child {
            Child::Element(element) => element,
            _ => continue,
        };
        let local_location = self.translate_location(location, element)?;
        self.walk_over_children(&mut element.children, &local_location)?;
    }
    Ok(())
}
/// Walks `dom` from the root placement and returns the collected matches.
///
/// `dom` is taken mutably because elements may receive a `data-mf2-id`
/// attribute when `enable_id_generation` is set.
///
/// # Errors
/// Propagates the first error raised during the walk.
pub(crate) fn for_document(
    dom: &mut swc_html_ast::Document,
    hook: Option<Arc<dyn super::ParserHook>>,
    enable_id_generation: bool,
) -> Result<Self, crate::parse::Error> {
    // Remaining fields start out empty / zero through `Default`.
    let mut matched = Self {
        hook,
        enable_id_generation,
        ..Default::default()
    };
    let root = Placement::Root;
    matched.walk_over_children(&mut dom.children, &root)?;
    Ok(matched)
}
#[allow(dead_code)]
pub(crate) fn for_document_default(
dom: &mut swc_html_ast::Document,
) -> Result<Self, crate::parse::Error> {
Self::for_document(dom, None, false)
}
}
/// Turns link-carrying nodes into relations on a `Document`.
pub struct LinkRelationExpander {
/// Base URL that `href` values are resolved against.
pub base_url: url::Url,
/// Nodes whose `rel`/`href` attributes are expanded.
pub elements: Vec<Node>,
}
impl LinkRelationExpander {
    /// Adds one relation to `document` for every element that has at least
    /// one `rel` token.
    ///
    /// The relation is keyed by the element's `href` resolved against
    /// `base_url` (a missing or empty `href` resolves to the base URL
    /// itself) and carries the `rel` tokens, the optional `hreflang`,
    /// `media`, `title` and `type` attributes, and the element's text.
    ///
    /// # Errors
    /// Fails when an `href` cannot be joined with the base URL or when text
    /// extraction fails.
    pub fn expand(self, document: &mut Document) -> Result<(), crate::parse::Error> {
        for node in self.elements {
            let rel_str = node
                .attr("rel")
                .filter(non_empty_string)
                .map(remove_surrounding_whitespace)
                .unwrap_or_default();

            // `rel` holds whitespace-separated tokens. Splitting on a single
            // space would yield empty tokens for repeated spaces and would
            // not separate tokens divided by tabs or newlines.
            let rels = rel_str
                .split_ascii_whitespace()
                .map(ToString::to_string)
                .collect::<Vec<_>>();
            if rels.is_empty() {
                continue;
            }

            let href = node
                .attr("href")
                .filter(non_empty_string)
                .map(remove_surrounding_whitespace)
                .unwrap_or_default();
            let url = self
                .base_url
                .join(&href)
                .map_err(crate::parse::Error::from)?;

            let hreflang = node.attr("hreflang").filter(non_empty_string);
            let title = node.attr("title").filter(non_empty_string);
            let media = node.attr("media").filter(non_empty_string);
            let r#type = node.attr("type").filter(non_empty_string);
            let Extraction { text, .. } = node.text_content(&self.base_url)?;

            let relation = microformats_types::Relation {
                rels,
                hreflang,
                media,
                title,
                r#type,
                text: Some(text),
            };
            document.add_relation(url, relation);
        }
        Ok(())
    }
}
#[cfg(test)]
pub(crate) mod test;