use markdown::mdast;
use super::css::*;
use super::style::*;
/// A single node of the styled tree produced from the parsed Markdown AST.
///
/// Every node pairs its structural variant ([`NodeKind`]) with the [`Style`]
/// that was resolved for it while the tree was being built.
#[derive(Debug, Clone)]
pub struct Node {
/// The node's variant together with its payload (text, children, ...).
pub kind: NodeKind,
/// The style attached to this node when it was constructed.
pub style: Style,
/// Flag chosen by the builder per node type.
/// NOTE(review): presumably tells the layout stage whether the node may be
/// split (e.g. across a page boundary) — confirm against the consumer.
pub splittable: bool,
}
impl Node {
pub fn new(kind: NodeKind, style: Style, splittable: bool) -> Self {
Self {
kind,
style,
splittable,
}
}
pub fn text_content(&self) -> String {
self.kind.text_content()
}
}
/// The structural variants a [`Node`] can take.
///
/// Container variants own their children; leaf variants own their payload
/// (text, code, image attributes).
#[derive(Debug, Clone)]
pub enum NodeKind {
/// Root of the tree (built from the mdast `Root` node).
Document { children: Vec<Node> },
/// Heading; `level` is the mdast heading depth.
Heading { level: u8, children: Vec<Node> },
/// Paragraph of inline content. Table cells are also built as this variant.
Paragraph { children: Vec<Node> },
/// Ordered or unordered list; `start` is copied from the mdast list node.
List {
ordered: bool,
start: Option<u32>,
children: Vec<Node>,
},
/// List item without a checkbox.
ListItem { children: Vec<Node> },
/// List item whose mdast `checked` field was set (task-list entry).
TaskListItem { checked: bool, children: Vec<Node> },
/// Image with its URL, alt text and optional title.
Image {
src: String,
alt: String,
title: Option<String>,
},
/// Code block with its raw contents and optional language tag.
CodeBlock { code: String, lang: Option<String> },
/// Block quote.
Blockquote { children: Vec<Node> },
/// Thematic break. The builder also emits this variant for a self-closing `<br>` tag.
ThematicBreak,
/// Table; `align` holds one entry per mdast column alignment.
Table {
children: Vec<Node>,
align: Vec<TextAlign>,
},
/// Table row; its cells are `Paragraph` nodes.
TableRow { children: Vec<Node> },
/// Literal text run. An empty text node is also used as a placeholder for
/// input the builder does not render.
Text { text: String },
/// Strong emphasis.
Strong { children: Vec<Node> },
/// Emphasis.
Emphasis { children: Vec<Node> },
/// Inline code span.
InlineCode { code: String },
/// Hyperlink with its target URL and optional title.
Link {
url: String,
title: Option<String>,
children: Vec<Node>,
},
/// Strikethrough (mdast `Delete`).
Delete { children: Vec<Node> },
/// Inline HTML `<span>` element.
Span { children: Vec<Node> },
/// HTML `<center>` element.
Center { children: Vec<Node> },
}
impl NodeKind {
pub fn text_content(&self) -> String {
match self {
NodeKind::Text { text } => text.clone(),
NodeKind::Strong { children }
| NodeKind::Emphasis { children }
| NodeKind::Link { children, .. }
| NodeKind::Delete { children } => {
let mut s = String::new();
for child in children {
s.push_str(&child.text_content());
}
s
}
NodeKind::InlineCode { code } => code.clone(),
NodeKind::Heading { children, .. }
| NodeKind::Paragraph { children }
| NodeKind::ListItem { children }
| NodeKind::TaskListItem { children, .. }
| NodeKind::Blockquote { children }
| NodeKind::TableRow { children }
| NodeKind::Span { children }
| NodeKind::Center { children } => {
let mut s = String::new();
for child in children {
s.push_str(&child.text_content());
}
s
}
_ => String::new(),
}
}
}
/// Builds the styled node tree for a parsed Markdown document.
///
/// The conversion starts at `root` with no ancestor tags and a default
/// parent style; `resolver` supplies the style for every node.
pub fn build_ast(root: &mdast::Node, resolver: &StyleResolver) -> Node {
    let initial_style = Style::default();
    let no_ancestors: &[String] = &[];
    build_node(root, resolver, no_ancestors, &initial_style)
}
fn build_node(
node: &mdast::Node,
resolver: &StyleResolver,
ancestor_tags: &[String],
parent_style: &Style,
) -> Node {
match node {
mdast::Node::Root(root) => {
let tag = "body";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children =
build_html_aware_children(&root.children, resolver, &new_ancestors, &style);
Node::new(NodeKind::Document { children }, style, false)
}
mdast::Node::Paragraph(_para) => {
let tag = "p";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children = build_inline_children(&_para.children, resolver, &new_ancestors, &style);
Node::new(NodeKind::Paragraph { children }, style, true)
}
mdast::Node::Heading(heading) => {
let tag = match heading.depth {
1 => "h1",
2 => "h2",
3 => "h3",
4 => "h4",
5 => "h5",
6 => "h6",
_ => "h1",
};
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children =
build_inline_children(&heading.children, resolver, &new_ancestors, &style);
Node::new(
NodeKind::Heading {
level: heading.depth,
children,
},
style,
false,
)
}
mdast::Node::Code(code) => {
let tag = "pre";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
Node::new(
NodeKind::CodeBlock {
code: code.value.clone(),
lang: code.lang.clone(),
},
style,
false,
)
}
mdast::Node::Image(image) => {
let tag = "img";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
Node::new(
NodeKind::Image {
src: image.url.clone(),
alt: image.alt.clone(),
title: image.title.clone(),
},
style,
false,
)
}
mdast::Node::List(list) => {
let tag = if list.ordered { "ol" } else { "ul" };
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = list
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(
NodeKind::List {
ordered: list.ordered,
start: list.start,
children,
},
style,
true,
)
}
mdast::Node::ListItem(item) => {
let tag = "li";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = item
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
match item.checked {
Some(checked) => {
Node::new(NodeKind::TaskListItem { checked, children }, style, true)
}
None => Node::new(NodeKind::ListItem { children }, style, true),
}
}
mdast::Node::Blockquote(blockquote) => {
let tag = "blockquote";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = blockquote
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::Blockquote { children }, style, true)
}
mdast::Node::ThematicBreak(_) => {
let tag = "hr";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
Node::new(NodeKind::ThematicBreak, style, false)
}
mdast::Node::Table(table) => {
let tag = "table";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let align: Vec<TextAlign> = table
.align
.iter()
.map(|a| match a {
mdast::AlignKind::Left => TextAlign::Left,
mdast::AlignKind::Right => TextAlign::Right,
mdast::AlignKind::Center => TextAlign::Center,
mdast::AlignKind::None => TextAlign::Left,
})
.collect();
let children: Vec<Node> = table
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::Table { children, align }, style, false)
}
mdast::Node::TableRow(row) => {
let tag = "tr";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = row
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::TableRow { children }, style, false)
}
mdast::Node::TableCell(cell) => {
let tag = "td";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children = build_inline_children(&cell.children, resolver, &new_ancestors, &style);
Node::new(NodeKind::Paragraph { children }, style, true)
}
mdast::Node::Text(text) => {
let tag = "span";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
Node::new(
NodeKind::Text {
text: text.value.clone(),
},
style,
true,
)
}
mdast::Node::Strong(strong) => {
let tag = "strong";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = strong
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::Strong { children }, style, true)
}
mdast::Node::Emphasis(emph) => {
let tag = "em";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = emph
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::Emphasis { children }, style, true)
}
mdast::Node::InlineCode(code) => {
let tag = "code";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
Node::new(
NodeKind::InlineCode {
code: code.value.clone(),
},
style,
true,
)
}
mdast::Node::Link(link) => {
let tag = "a";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut style = style;
style.link_url = Some(link.url.clone());
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = link
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(
NodeKind::Link {
url: link.url.clone(),
title: link.title.clone(),
children,
},
style,
true,
)
}
mdast::Node::Delete(del) => {
let tag = "del";
let style = resolver.resolve_style(tag, &[], ancestor_tags, parent_style);
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(tag.to_string());
let children: Vec<Node> = del
.children
.iter()
.map(|child| build_node(child, resolver, &new_ancestors, &style))
.collect();
Node::new(NodeKind::Delete { children }, style, true)
}
mdast::Node::Html(html) => match classify_html(&html.value) {
HtmlClassification::SelfClosing(info) => match info.tag.as_str() {
"br" => Node::new(NodeKind::ThematicBreak, Style::default(), false),
_ => Node::new(
NodeKind::Text {
text: String::new(),
},
Style::default(),
false,
),
},
HtmlClassification::Container(info, inner_content) => {
let style =
resolver.resolve_style(&info.tag, &info.classes, ancestor_tags, parent_style);
let mut style = style;
if let Some(inline_css) = &info.inline_style {
resolver.apply_inline_style(&mut style, inline_css);
}
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push(info.tag.clone());
let inner_mdast =
markdown::to_mdast(&inner_content, &markdown::ParseOptions::default());
let inner_children = match inner_mdast {
Ok(mdast::Node::Root(root)) => {
build_html_aware_children(&root.children, resolver, &new_ancestors, &style)
}
_ => vec![],
};
match info.tag.as_str() {
"center" => Node::new(
NodeKind::Center {
children: inner_children,
},
style,
true,
),
_ => Node::new(
NodeKind::Text {
text: String::new(),
},
Style::default(),
false,
),
}
}
HtmlClassification::OpenTag(info) => {
let style =
resolver.resolve_style(&info.tag, &info.classes, ancestor_tags, parent_style);
let mut style = style;
if let Some(inline_css) = &info.inline_style {
resolver.apply_inline_style(&mut style, inline_css);
}
match info.tag.as_str() {
"center" => Node::new(NodeKind::Center { children: vec![] }, style, true),
"span" => Node::new(NodeKind::Span { children: vec![] }, style, true),
_ => Node::new(
NodeKind::Text {
text: String::new(),
},
Style::default(),
false,
),
}
}
HtmlClassification::CloseTag(_) | HtmlClassification::Text => Node::new(
NodeKind::Text {
text: String::new(),
},
Style::default(),
false,
),
},
_ => Node::new(
NodeKind::Text {
text: String::new(),
},
Style::default(),
true,
),
}
}
/// Builds the inline children of a block node (paragraph, heading, table cell).
///
/// Thin wrapper that forwards all arguments unchanged to
/// `build_inline_html_aware_children`.
fn build_inline_children(
children: &[mdast::Node],
resolver: &StyleResolver,
ancestor_tags: &[String],
parent_style: &Style,
) -> Vec<Node> {
build_inline_html_aware_children(children, resolver, ancestor_tags, parent_style)
}
/// Visits `node` and all of its descendants in depth-first pre-order.
///
/// `callback` is invoked on a node before any of its children; children are
/// visited in their stored order. Leaf variants (image, code block, thematic
/// break, text, inline code) have no children to descend into.
pub fn walk<F>(node: &Node, callback: &mut F)
where
    F: FnMut(&Node),
{
    callback(node);

    let descendants: &[Node] = match &node.kind {
        NodeKind::Document { children }
        | NodeKind::Heading { children, .. }
        | NodeKind::Paragraph { children }
        | NodeKind::List { children, .. }
        | NodeKind::ListItem { children }
        | NodeKind::TaskListItem { children, .. }
        | NodeKind::Blockquote { children }
        | NodeKind::Table { children, .. }
        | NodeKind::TableRow { children }
        | NodeKind::Strong { children }
        | NodeKind::Emphasis { children }
        | NodeKind::Link { children, .. }
        | NodeKind::Delete { children }
        | NodeKind::Span { children }
        | NodeKind::Center { children } => children.as_slice(),
        NodeKind::Image { .. }
        | NodeKind::CodeBlock { .. }
        | NodeKind::ThematicBreak
        | NodeKind::Text { .. }
        | NodeKind::InlineCode { .. } => &[],
    };

    descendants.iter().for_each(|child| walk(child, callback));
}
/// Concatenates the payload of every `Text` node in the subtree rooted at
/// `node`, in the order `walk` visits them.
///
/// Only `NodeKind::Text` contributes; inline code and code blocks are skipped.
pub fn collect_text(node: &Node) -> String {
    let mut collected = String::new();
    let mut append_text = |visited: &Node| match &visited.kind {
        NodeKind::Text { text } => collected.push_str(text),
        _ => {}
    };
    walk(node, &mut append_text);
    collected
}
/// Description of a single HTML tag as extracted by `parse_html_tag`.
#[derive(Debug, Clone)]
pub(crate) struct TagInfo {
/// Tag name as reported by the HTML tokenizer.
pub tag: String,
/// Whitespace-separated entries of the `class` attribute (empty when absent).
pub classes: Vec<String>,
/// Value of the `id` attribute, if any. Not read anywhere in this file,
/// hence the `dead_code` allowance.
#[allow(dead_code)]
pub id: Option<String>,
/// Raw value of the `style` attribute, if any.
pub inline_style: Option<String>,
/// `true` for a closing tag such as `</span>`.
pub is_close: bool,
/// `true` when the tokenizer flagged the tag as self-closing (`<br/>`).
pub is_self_closing: bool,
}
/// Result of `classify_html`: the shape of a raw HTML fragment.
enum HtmlClassification {
/// The first tag of the fragment is self-closing (e.g. `<br/>`).
SelfClosing(TagInfo),
/// The fragment starts with an opening tag that has no matching closing tag.
OpenTag(TagInfo),
/// The fragment starts with a closing tag.
CloseTag(TagInfo),
/// A complete element: the opening tag's info plus the raw text between
/// the opening tag and its closing tag.
Container(TagInfo, String),
/// Anything that could not be classified as one of the tag shapes above.
Text,
}
/// Classifies a raw HTML fragment taken from an mdast `Html` node.
///
/// The fragment is tokenized with html5ever and the first tag token decides
/// the result:
///
/// * self-closing start tag → [`HtmlClassification::SelfClosing`];
/// * start tag with a balanced closing tag of the same name later in the
///   fragment → [`HtmlClassification::Container`], carrying the opening tag's
///   info and the raw text between the opening tag and the *last* matching
///   closing tag;
/// * start tag without a matching closing tag → [`HtmlClassification::OpenTag`];
/// * end tag → [`HtmlClassification::CloseTag`];
/// * anything else, or any fragment whose tag cannot be parsed →
///   [`HtmlClassification::Text`].
///
/// The function never panics on malformed input. Fragments such as
/// `<br/>text` or `<span>hello`, for which `parse_html_tag` yields `None`
/// because they do not end with `>`, are classified as `Text`.
fn classify_html(html: &str) -> HtmlClassification {
    use std::cell::RefCell;
    use std::rc::Rc;
    use html5ever::tendril::StrTendril;
    use html5ever::tokenizer::{
        BufferQueue, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer,
    };

    let html = html.trim();
    if !html.starts_with('<') {
        return HtmlClassification::Text;
    }

    /// Reduced view of a tokenizer token: only tag names and kinds matter here.
    #[derive(Debug, Clone)]
    enum SimpleToken {
        StartTag { name: String, self_closing: bool },
        EndTag { name: String },
        Other,
    }

    /// Token sink that records every token in its simplified form.
    struct ClassifySink {
        tokens: Rc<RefCell<Vec<SimpleToken>>>,
    }

    impl TokenSink for ClassifySink {
        type Handle = ();

        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
            let simplified = match token {
                Token::TagToken(tag) => match tag.kind {
                    TagKind::StartTag => SimpleToken::StartTag {
                        name: tag.name.to_string(),
                        self_closing: tag.self_closing,
                    },
                    TagKind::EndTag => SimpleToken::EndTag {
                        name: tag.name.to_string(),
                    },
                },
                _ => SimpleToken::Other,
            };
            self.tokens.borrow_mut().push(simplified);
            TokenSinkResult::Continue
        }
    }

    let tokens_rc = Rc::new(RefCell::new(Vec::new()));
    let sink = ClassifySink {
        tokens: tokens_rc.clone(),
    };
    let tokenizer = Tokenizer::new(sink, Default::default());
    let mut input = BufferQueue::default();
    input.push_back(StrTendril::from(html));
    let _ = tokenizer.feed(&input);
    tokenizer.end();
    let tokens = tokens_rc.take();

    // The first tag token (skipping text, comments, ...) drives the classification.
    let Some(first_tag_idx) = tokens
        .iter()
        .position(|t| !matches!(t, SimpleToken::Other))
    else {
        return HtmlClassification::Text;
    };

    match &tokens[first_tag_idx] {
        SimpleToken::StartTag {
            self_closing: true, ..
        } => match parse_html_tag(html) {
            Some(info) => HtmlClassification::SelfClosing(info),
            // e.g. `<br/>trailing`: the fragment is not a bare tag.
            None => HtmlClassification::Text,
        },
        SimpleToken::StartTag {
            name,
            self_closing: false,
        } => {
            // Look for the closing tag that balances the opening one, counting
            // nested elements of the same name.
            let mut depth: i32 = 1;
            let mut has_matching_close = false;
            for t in &tokens[first_tag_idx + 1..] {
                match t {
                    SimpleToken::StartTag { name: n, .. } if n == name => depth += 1,
                    SimpleToken::EndTag { name: n } if n == name => {
                        depth -= 1;
                        if depth == 0 {
                            has_matching_close = true;
                            break;
                        }
                    }
                    _ => {}
                }
            }

            if !has_matching_close {
                return match parse_html_tag(html) {
                    Some(info) => HtmlClassification::OpenTag(info),
                    // e.g. `<span>hello`: the fragment is not a bare tag.
                    None => HtmlClassification::Text,
                };
            }

            // Locate the opening tag and the last closing tag in the raw text
            // to cut out the inner content verbatim.
            let tag_start = format!("<{}", name);
            let Some(name_start) = html.find(&tag_start) else {
                return HtmlClassification::Text;
            };
            let Some(after_open) = html[name_start..].find('>') else {
                return HtmlClassification::Text;
            };
            let content_start = name_start + after_open + 1;
            let close_tag = format!("</{}>", name);
            // `content_end` is relative to `content_start`.
            let Some(content_end) = html[content_start..].rfind(&close_tag) else {
                return HtmlClassification::Text;
            };
            let inner = html[content_start..content_start + content_end].to_string();
            match parse_html_tag(&html[name_start..=name_start + after_open]) {
                Some(info) => HtmlClassification::Container(info, inner),
                None => HtmlClassification::Text,
            }
        }
        SimpleToken::EndTag { name } => HtmlClassification::CloseTag(TagInfo {
            tag: name.clone(),
            classes: vec![],
            id: None,
            inline_style: None,
            is_close: true,
            is_self_closing: false,
        }),
        SimpleToken::Other => HtmlClassification::Text,
    }
}
/// Extracts tag information from an HTML fragment.
///
/// Returns `None` when the trimmed fragment does not both start with `<` and
/// end with `>`, or when tokenizing it produces no tag token.
///
/// When the fragment contains several tags, the info of the *last* tag token
/// is returned, because each tag token overwrites the previous result. For
/// example, `<center>x</center>` is reported as a closing `center` tag.
///
/// For start tags the `class`, `id` and `style` attributes are captured;
/// closing tags carry only their name.
pub(crate) fn parse_html_tag(html: &str) -> Option<TagInfo> {
    use std::cell::RefCell;
    use std::rc::Rc;
    use html5ever::tendril::StrTendril;
    use html5ever::tokenizer::TagKind;
    use html5ever::tokenizer::{BufferQueue, Tag, Token, TokenSink, TokenSinkResult, Tokenizer};

    let fragment = html.trim();
    if !(fragment.starts_with('<') && fragment.ends_with('>')) {
        return None;
    }

    /// Builds the info for a start tag, picking up its class/id/style attributes.
    fn describe_start_tag(tag: Tag) -> TagInfo {
        let mut info = TagInfo {
            tag: tag.name.to_string(),
            classes: Vec::new(),
            id: None,
            inline_style: None,
            is_close: false,
            is_self_closing: tag.self_closing,
        };
        for attr in tag.attrs {
            let key = attr.name.local.to_string().to_lowercase();
            let value = attr.value.to_string();
            match key.as_str() {
                "class" => {
                    info.classes = value.split_whitespace().map(String::from).collect();
                }
                "id" => info.id = Some(value),
                "style" => info.inline_style = Some(value),
                _ => {}
            }
        }
        info
    }

    /// Token sink that remembers the most recent tag token only.
    struct LastTagSink {
        last_tag: Rc<RefCell<Option<TagInfo>>>,
    }

    impl TokenSink for LastTagSink {
        type Handle = ();

        fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
            if let Token::TagToken(tag) = token {
                let info = match tag.kind {
                    TagKind::StartTag => describe_start_tag(tag),
                    TagKind::EndTag => TagInfo {
                        tag: tag.name.to_string(),
                        classes: vec![],
                        id: None,
                        inline_style: None,
                        is_close: true,
                        is_self_closing: false,
                    },
                };
                *self.last_tag.borrow_mut() = Some(info);
            }
            TokenSinkResult::Continue
        }
    }

    let last_tag = Rc::new(RefCell::new(None));
    let tokenizer = Tokenizer::new(
        LastTagSink {
            last_tag: Rc::clone(&last_tag),
        },
        Default::default(),
    );
    let mut queue = BufferQueue::default();
    queue.push_back(StrTendril::from(fragment));
    let _ = tokenizer.feed(&queue);
    tokenizer.end();

    // Tokenizing is finished, so the recorded value can simply be moved out.
    last_tag.take()
}
fn build_html_aware_children(
nodes: &[mdast::Node],
resolver: &StyleResolver,
ancestor_tags: &[String],
parent_style: &Style,
) -> Vec<Node> {
let mut result: Vec<Node> = Vec::new();
let mut i = 0;
while i < nodes.len() {
let node = &nodes[i];
if let mdast::Node::Html(html) = node
&& let Some(info) = parse_html_tag(&html.value)
&& !info.is_close
&& !info.is_self_closing
{
let is_container = matches!(info.tag.as_str(), "center");
if is_container {
let mut inner_nodes: Vec<mdast::Node> = Vec::new();
let mut depth = 1;
let mut j = i + 1;
while j < nodes.len() && depth > 0 {
if let mdast::Node::Html(inner_html) = &nodes[j]
&& let Some(inner_info) = parse_html_tag(&inner_html.value)
{
if inner_info.is_close && inner_info.tag == info.tag {
depth -= 1;
if depth == 0 {
j += 1;
break;
}
} else if !inner_info.is_close
&& !inner_info.is_self_closing
&& inner_info.tag == info.tag
{
depth += 1;
}
}
if depth > 0 {
inner_nodes.push(nodes[j].clone());
}
j += 1;
}
let style =
resolver.resolve_style("center", &info.classes, ancestor_tags, parent_style);
let mut style = style;
if let Some(inline_css) = &info.inline_style {
resolver.apply_inline_style(&mut style, inline_css);
}
let mut new_ancestors = ancestor_tags.to_vec();
new_ancestors.push("center".to_string());
let children =
build_html_aware_children(&inner_nodes, resolver, &new_ancestors, &style);
let node = Node::new(NodeKind::Center { children }, style, true);
result.push(node);
i = j;
continue;
}
}
result.push(build_node(node, resolver, ancestor_tags, parent_style));
i += 1;
}
result
}
/// Builds inline children, pairing `<span>` / `</span>` tags that arrive as
/// separate mdast `Html` siblings.
///
/// When a sibling is an opening, non-self-closing `<span>` tag with a
/// balancing `</span>` later among the siblings, the nodes in between become
/// the children of a single `Span` node. The span is styled from the tag's
/// classes and inline `style` attribute. Nested spans are balanced by depth,
/// so an outer `<span>` is closed by its own `</span>` rather than by the
/// first one encountered; inner spans are handled by the recursive call.
///
/// An opening `<span>` without a balancing closing tag, and every other node,
/// is converted with `build_node`.
pub(crate) fn build_inline_html_aware_children(
    nodes: &[mdast::Node],
    resolver: &StyleResolver,
    ancestor_tags: &[String],
    parent_style: &Style,
) -> Vec<Node> {
    /// Finds the index of the `</span>` that balances a `<span>` opened just
    /// before `start`, or `None` when the siblings never close it.
    fn matching_span_close(nodes: &[mdast::Node], start: usize) -> Option<usize> {
        let mut depth = 1usize;
        for (idx, candidate) in nodes.iter().enumerate().skip(start) {
            let mdast::Node::Html(html) = candidate else {
                continue;
            };
            let Some(info) = parse_html_tag(&html.value) else {
                continue;
            };
            if info.tag != "span" {
                continue;
            }
            if info.is_close {
                depth -= 1;
                if depth == 0 {
                    return Some(idx);
                }
            } else if !info.is_self_closing {
                depth += 1;
            }
        }
        None
    }

    let mut result: Vec<Node> = Vec::with_capacity(nodes.len());
    let mut i = 0;
    while i < nodes.len() {
        let node = &nodes[i];
        if let mdast::Node::Html(html) = node
            && let Some(info) = parse_html_tag(&html.value)
            && !info.is_close
            && !info.is_self_closing
            && info.tag == "span"
            && let Some(close_idx) = matching_span_close(nodes, i + 1)
        {
            let mut style =
                resolver.resolve_style("span", &info.classes, ancestor_tags, parent_style);
            if let Some(inline_css) = &info.inline_style {
                resolver.apply_inline_style(&mut style, inline_css);
            }
            let mut new_ancestors = ancestor_tags.to_vec();
            new_ancestors.push("span".to_string());
            // The enclosed siblings are borrowed as a slice; no cloning needed.
            let children = build_inline_html_aware_children(
                &nodes[i + 1..close_idx],
                resolver,
                &new_ancestors,
                &style,
            );
            result.push(Node::new(NodeKind::Span { children }, style, true));
            // Resume after the closing tag, which is consumed.
            i = close_idx + 1;
            continue;
        }
        result.push(build_node(node, resolver, ancestor_tags, parent_style));
        i += 1;
    }
    result
}