use html5ever::LocalName;
use super::arena::{ArenaDom, ArenaNodeData, ArenaNodeId};
use super::css::{Origin, Stylesheet, compute_styles};
use super::element_ref::ElementRef;
use crate::ir::{ComputedStyle, Display, IRChapter, Node, NodeId, Role};
/// Builds the built-in user-agent stylesheet.
///
/// The sheet assigns a `display` value to the HTML elements it lists, plus a
/// few presentational defaults (heading sizes and weights, italic/bold/
/// monospace inline semantics, sub/superscript alignment, underline and
/// line-through decorations, list markers). Elements under "Hidden elements"
/// get `display: none`.
///
/// The CSS text is parsed with [`Stylesheet::parse`] on every call.
pub fn user_agent_stylesheet() -> Stylesheet {
    Stylesheet::parse(
        r#"
/* Block elements */
html, body, div, section, article, aside, nav, header, footer, main,
address, blockquote, figure, figcaption, details, summary, p, pre, hr, hgroup {
display: block;
}
/* Headings */
h1, h2, h3, h4, h5, h6 {
display: block;
font-weight: bold;
}
h1 { font-size: 2em; }
h2 { font-size: 1.5em; }
h3 { font-size: 1.17em; }
h4 { font-size: 1em; }
h5 { font-size: 0.83em; }
h6 { font-size: 0.67em; }
/* Lists - display and list-style only, no padding/margins */
ul, ol { display: block; }
ul { list-style-type: disc; }
ol { list-style-type: decimal; }
li { display: list-item; }
/* Definition lists - block-level like the other list containers */
dl, dt, dd { display: block; }
/* Inline elements */
span, a, em, i, strong, b, cite, var, dfn, abbr, acronym,
code, kbd, samp, tt, sub, sup, small, big, q,
u, ins, s, strike, del, mark, time, label, img {
display: inline;
}
/* Semantic inline styles */
em, i, cite, var, dfn { font-style: italic; }
strong, b { font-weight: bold; }
code, kbd, samp, tt, pre { font-family: monospace; }
sup { vertical-align: super; }
sub { vertical-align: sub; }
u, ins { text-decoration: underline; }
s, strike, del { text-decoration: line-through; }
/* Tables */
table { display: table; }
tr { display: table-row; }
td, th { display: table-cell; }
th { font-weight: bold; }
/* Hidden elements */
head, script, style, link, meta, title, template {
display: none;
}
"#,
    )
}
/// Maps an element's local tag name to the IR [`Role`] used for its node.
///
/// Tag names that are not listed here fall back to [`Role::Container`].
fn map_element_to_role(local_name: &LocalName) -> Role {
    match local_name.as_ref() {
        // Generic structural containers.
        "div" | "section" | "article" | "nav" | "header" | "footer" | "main" | "address"
        | "details" | "summary" | "hgroup" => Role::Container,

        // Breaks and rules.
        "br" => Role::Break,
        "hr" => Role::Rule,

        // Block-level content with a dedicated role.
        "aside" => Role::Sidebar,
        "figure" => Role::Figure,
        "figcaption" | "caption" => Role::Caption,
        "p" => Role::Paragraph,
        "pre" => Role::CodeBlock,
        "blockquote" => Role::BlockQuote,

        // Headings carry their level.
        "h1" => Role::Heading(1),
        "h2" => Role::Heading(2),
        "h3" => Role::Heading(3),
        "h4" => Role::Heading(4),
        "h5" => Role::Heading(5),
        "h6" => Role::Heading(6),

        // Links and images.
        "a" => Role::Link,
        "img" => Role::Image,

        // Lists.
        "ul" => Role::UnorderedList,
        "ol" => Role::OrderedList,
        "li" => Role::ListItem,
        "dl" => Role::DefinitionList,
        "dt" => Role::DefinitionTerm,
        "dd" => Role::DefinitionDescription,

        // Tables.
        "table" => Role::Table,
        "tr" => Role::TableRow,
        "td" | "th" => Role::TableCell,

        // Inline-level elements. `acronym` and `big` are listed here because the
        // user-agent stylesheet in this file declares them `display: inline`,
        // the same as their siblings `abbr` and `small`.
        "span" | "em" | "i" | "cite" | "var" | "dfn" | "strong" | "b" | "code" | "kbd" | "samp"
        | "tt" | "sup" | "sub" | "u" | "ins" | "s" | "strike" | "del" | "small" | "big"
        | "mark" | "abbr" | "acronym" | "time" | "q" | "label" | "legend" | "output" | "data"
        | "ruby" | "rt" | "rp" | "bdi" | "bdo" | "wbr" => Role::Inline,

        // Anything unrecognised is treated as a plain container.
        _ => Role::Container,
    }
}
/// Working state for a single DOM-to-IR transformation pass.
struct TransformContext<'a> {
    /// Source DOM being walked; only read from in this file.
    dom: &'a ArenaDom,
    /// Stylesheets, each paired with its [`Origin`], handed to `compute_styles`.
    stylesheets: &'a [(Stylesheet, Origin)],
    /// Chapter under construction; returned to the caller by `transform`.
    chapter: IRChapter,
    /// DOM node id -> IR node id for every node emitted so far.
    /// NOTE(review): populated during traversal but not read in the code
    /// visible here — confirm whether it is still needed.
    node_map: std::collections::HashMap<ArenaNodeId, NodeId>,
}
impl<'a> TransformContext<'a> {
    /// Creates a context over `dom` and `stylesheets` with an empty chapter
    /// and an empty DOM-to-IR node map.
    fn new(dom: &'a ArenaDom, stylesheets: &'a [(Stylesheet, Origin)]) -> Self {
        Self {
            dom,
            stylesheets,
            chapter: IRChapter::new(),
            node_map: std::collections::HashMap::new(),
        }
    }

    /// Consumes the context, converts the DOM and returns the finished chapter.
    ///
    /// Traversal starts at the first `body` element, or at the document node
    /// when no `body` is found. The start node itself is not emitted; its
    /// children are attached to [`NodeId::ROOT`] and inherit its computed style.
    ///
    /// The inherited language is taken from the start node's own `lang`
    /// attribute when present; otherwise, if style computation left it unset,
    /// from the `lang` attribute of the `html` element.
    fn transform(mut self) -> IRChapter {
        let dom = self.dom;
        let body = dom.find_by_tag("body").unwrap_or_else(|| dom.document());

        let mut body_style = compute_styles(
            ElementRef::new(dom, body),
            self.stylesheets,
            None,
            &mut self.chapter.styles,
        );

        // `body` never goes through `process_node`, so its `lang` attribute has
        // to be honoured here, the same way it is for every other element.
        if let Some(lang) = self.lang_attr(body) {
            body_style.language = Some(lang);
        } else if body_style.language.is_none() {
            body_style.language = dom
                .find_by_tag("html")
                .and_then(|html_id| self.lang_attr(html_id));
        }

        self.process_children(body, NodeId::ROOT, Some(&body_style));
        self.chapter
    }

    /// Returns the first non-empty `lang` attribute of element `id`.
    ///
    /// Yields `None` when `id` is unknown, is not an element, or has no
    /// non-empty `lang` attribute.
    fn lang_attr(&self, id: ArenaNodeId) -> Option<String> {
        let node = self.dom.get(id)?;
        let ArenaNodeData::Element { attrs, .. } = &node.data else {
            return None;
        };
        attrs
            .iter()
            .find(|attr| attr.name.local.as_ref() == "lang" && !attr.value.is_empty())
            .map(|attr| attr.value.clone())
    }

    /// Parses a non-negative integer attribute value, ignoring surrounding
    /// ASCII whitespace. Returns `None` when the remainder is not a `u32`.
    fn parse_u32_attr(value: &str) -> Option<u32> {
        value
            .trim_matches(|c: char| c.is_ascii_whitespace())
            .parse()
            .ok()
    }

    /// Appends `text` to the chapter's text storage, creates a text node for
    /// it under `ir_parent`, and records the DOM-to-IR id mapping.
    fn append_text_node(&mut self, dom_id: ArenaNodeId, ir_parent: NodeId, text: &str) {
        let range = self.chapter.append_text(text);
        let ir_id = self.chapter.alloc_node(Node::text(range));
        self.chapter.append_child(ir_parent, ir_id);
        self.node_map.insert(dom_id, ir_id);
    }

    /// Processes every child of `dom_parent` in order, attaching the results
    /// to `ir_parent` and passing `parent_style` down for inheritance.
    fn process_children(
        &mut self,
        dom_parent: ArenaNodeId,
        ir_parent: NodeId,
        parent_style: Option<&ComputedStyle>,
    ) {
        // Copy the shared DOM reference out of `self` so the child iterator
        // does not borrow `self`; this avoids collecting the ids into a Vec.
        let dom = self.dom;
        for child_id in dom.children(dom_parent) {
            self.process_node(child_id, ir_parent, parent_style);
        }
    }

    /// Converts one DOM node (and, for elements, its subtree) into IR nodes
    /// under `ir_parent`.
    ///
    /// * Text: non-blank text is copied verbatim. Whitespace-only text is
    ///   dropped when there is no parent style, or when it contains a newline
    ///   and the parent's display is not inline; otherwise it collapses to a
    ///   single space.
    /// * Element: styles are computed, a `lang` attribute overrides the
    ///   computed language, and elements with `display: none` are skipped
    ///   together with their subtree (except `Role::Break`). Recognised
    ///   attributes are recorded in the chapter's semantics before recursing
    ///   into the children.
    /// * Document, comment and doctype nodes are ignored.
    fn process_node(
        &mut self,
        dom_id: ArenaNodeId,
        ir_parent: NodeId,
        parent_style: Option<&ComputedStyle>,
    ) {
        let dom = self.dom;
        let Some(node) = dom.get(dom_id) else {
            return;
        };

        match &node.data {
            ArenaNodeData::Text(text) => {
                if !text.trim().is_empty() {
                    self.append_text_node(dom_id, ir_parent, text);
                    return;
                }

                // Whitespace-only text from here on.
                let Some(style) = parent_style else {
                    return;
                };
                let is_block_parent = style.display != Display::Inline;
                if is_block_parent && text.contains('\n') {
                    // Source formatting between block-level children.
                    return;
                }
                self.append_text_node(dom_id, ir_parent, " ");
            }
            ArenaNodeData::Element { name, attrs, .. } => {
                let tag: &str = name.local.as_ref();

                let mut computed = compute_styles(
                    ElementRef::new(dom, dom_id),
                    self.stylesheets,
                    parent_style,
                    &mut self.chapter.styles,
                );
                if let Some(lang) = self.lang_attr(dom_id) {
                    computed.language = Some(lang);
                }

                let role = map_element_to_role(&name.local);
                if computed.display == Display::None && role != Role::Break {
                    return;
                }

                let mut ir_node = Node::new(role);
                ir_node.style = self.chapter.styles.intern(computed.clone());
                let ir_id = self.chapter.alloc_node(ir_node);
                self.chapter.append_child(ir_parent, ir_id);
                self.node_map.insert(dom_id, ir_id);

                for attr in attrs {
                    let attr_name = attr.name.local.as_ref();
                    let attr_ns = attr.name.ns.as_ref();
                    let semantics = &mut self.chapter.semantics;
                    match attr_name {
                        "href" => semantics.set_href(ir_id, attr.value.clone()),
                        "src" => semantics.set_src(ir_id, attr.value.clone()),
                        "alt" => semantics.set_alt(ir_id, attr.value.clone()),
                        "id" => semantics.set_id(ir_id, attr.value.clone()),
                        "title" => semantics.set_title(ir_id, attr.value.clone()),
                        "lang" => semantics.set_lang(ir_id, attr.value.clone()),
                        "start" if tag == "ol" => {
                            if let Some(start) = Self::parse_u32_attr(&attr.value) {
                                semantics.set_list_start(ir_id, start);
                            }
                        }
                        // `epub:type` can arrive either as a namespaced `type`
                        // attribute or as a literal `epub:type` local name.
                        "type" if attr_ns == "http://www.idpf.org/2007/ops" => {
                            semantics.set_epub_type(ir_id, attr.value.clone());
                        }
                        "epub:type" => semantics.set_epub_type(ir_id, attr.value.clone()),
                        "role" => semantics.set_aria_role(ir_id, attr.value.clone()),
                        "datetime" => semantics.set_datetime(ir_id, attr.value.clone()),
                        "rowspan" if matches!(tag, "td" | "th") => {
                            if let Some(span) = Self::parse_u32_attr(&attr.value) {
                                semantics.set_row_span(ir_id, span);
                            }
                        }
                        "colspan" if matches!(tag, "td" | "th") => {
                            if let Some(span) = Self::parse_u32_attr(&attr.value) {
                                semantics.set_col_span(ir_id, span);
                            }
                        }
                        "class" if matches!(tag, "code" | "pre") => {
                            // The first class carrying a `language-` or `lang-`
                            // prefix names the code language.
                            let code_lang = attr.value.split_whitespace().find_map(|class| {
                                class
                                    .strip_prefix("language-")
                                    .or_else(|| class.strip_prefix("lang-"))
                            });
                            if let Some(lang) = code_lang {
                                semantics.set_language(ir_id, lang.to_string());
                            }
                        }
                        _ => {}
                    }
                }

                if tag == "th" {
                    self.chapter.semantics.set_header_cell(ir_id, true);
                }

                self.process_children(dom_id, ir_id, Some(&computed));
            }
            ArenaNodeData::Document | ArenaNodeData::Comment(_) | ArenaNodeData::Doctype { .. } => {
            }
        }
    }
}
/// Converts a parsed DOM into an [`IRChapter`].
///
/// Styles are resolved from `stylesheets`, whose entries pair each
/// stylesheet with its [`Origin`]. This is a thin entry point over
/// `TransformContext`.
pub fn transform(dom: &ArenaDom, stylesheets: &[(Stylesheet, Origin)]) -> IRChapter {
    TransformContext::new(dom, stylesheets).transform()
}
#[cfg(test)]
mod tests {
    use html5ever::driver::ParseOpts;
    use html5ever::parse_document;
    use html5ever::tendril::TendrilSink;

    use super::*;
    use crate::compiler::tree_sink::ArenaSink;

    /// Parses `html` with html5ever into an [`ArenaDom`].
    fn parse_html(html: &str) -> ArenaDom {
        let sink = ArenaSink::new();
        let result = parse_document(sink, ParseOpts::default())
            .from_utf8()
            .one(html.as_bytes());
        result.into_dom()
    }

    /// Parses `html` and transforms it using only the user-agent stylesheet.
    fn transform_with_ua(html: &str) -> IRChapter {
        let dom = parse_html(html);
        let stylesheets = vec![(user_agent_stylesheet(), Origin::UserAgent)];
        transform(&dom, &stylesheets)
    }

    /// Returns `true` when any node in `chapter` has the given `role`.
    fn has_role(chapter: &IRChapter, role: Role) -> bool {
        for id in chapter.iter_dfs() {
            if chapter.node(id).unwrap().role == role {
                return true;
            }
        }
        false
    }

    /// Collects the text content of every `Role::Text` node in DFS order.
    fn text_contents(chapter: &IRChapter) -> Vec<String> {
        let mut texts = Vec::new();
        for id in chapter.iter_dfs() {
            let node = chapter.node(id).unwrap();
            if node.role == Role::Text {
                texts.push(chapter.text(node.text).to_string());
            }
        }
        texts
    }

    #[test]
    fn test_basic_transform() {
        let chapter = transform_with_ua("<html><body><p>Hello, World!</p></body></html>");
        assert!(chapter.node_count() >= 3);

        let mut found_text = false;
        for id in chapter.iter_dfs() {
            let node = chapter.node(id).unwrap();
            if node.role == Role::Text && !node.text.is_empty() {
                found_text = true;
                let text = chapter.text(node.text);
                assert!(text.contains("Hello"));
            }
        }
        assert!(found_text);
    }

    #[test]
    fn test_heading_levels() {
        let chapter =
            transform_with_ua("<html><body><h1>Title</h1><h2>Subtitle</h2></body></html>");

        let mut h1_count = 0;
        let mut h2_count = 0;
        for id in chapter.iter_dfs() {
            match chapter.node(id).unwrap().role {
                Role::Heading(1) => h1_count += 1,
                Role::Heading(2) => h2_count += 1,
                _ => {}
            }
        }
        assert_eq!(h1_count, 1);
        assert_eq!(h2_count, 1);
    }

    #[test]
    fn test_link_semantics() {
        let chapter = transform_with_ua(r#"<a href="https://example.com">Link</a>"#);

        for id in chapter.iter_dfs() {
            if chapter.node(id).unwrap().role == Role::Link {
                assert_eq!(chapter.semantics.href(id), Some("https://example.com"));
                return;
            }
        }
        panic!("Link not found");
    }

    // Smoke test only: it checks that a document with an inline style and an
    // author stylesheet transforms into more than one node. It does not
    // inspect the computed colour.
    #[test]
    fn test_style_inheritance() {
        let dom = parse_html(
            r#"<html><body>
<div style="color: red;"><p>Inherited</p></div>
</body></html>"#,
        );
        let ua = user_agent_stylesheet();
        let author = Stylesheet::parse("div { color: red; }");
        let stylesheets = vec![(ua, Origin::UserAgent), (author, Origin::Author)];
        let chapter = transform(&dom, &stylesheets);
        assert!(chapter.node_count() > 1);
    }

    #[test]
    fn test_hidden_elements() {
        let chapter = transform_with_ua(
            r#"<html><head><title>Test</title></head><body><p>Visible</p></body></html>"#,
        );

        let texts = text_contents(&chapter);
        assert!(
            texts.iter().all(|text| !text.contains("Test")),
            "text from <title> leaked into the chapter"
        );
        // Guard against a vacuous pass: the visible paragraph must survive.
        assert!(
            texts.iter().any(|text| text.contains("Visible")),
            "visible body text is missing"
        );
    }

    #[test]
    fn test_br_element() {
        let chapter =
            transform_with_ua(r#"<html><body><p>Line one<br/>Line two</p></body></html>"#);
        assert!(has_role(&chapter, Role::Break), "Break node not found");
    }

    #[test]
    fn test_br_element_xhtml_style() {
        let chapter = transform_with_ua(
            r#"<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body><p><span>Line one</span><br/><span>Line two</span></p></body></html>"#,
        );
        assert!(
            has_role(&chapter, Role::Break),
            "Break node not found in XHTML-style input"
        );
    }

    #[test]
    fn test_br_in_blockquote_verse() {
        let chapter = transform_with_ua(
            r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<blockquote>
<p lang="la">
<span>Cui non conveniet sua res, ut calceus olim,</span>
<br/>
<span>Si pede major erit, subvertet; si minor, uret.</span>
</p>
</blockquote>
</body></html>"#,
        );
        assert!(
            has_role(&chapter, Role::Break),
            "Break node not found in blockquote verse"
        );
    }
}