mod arena;
mod css;
mod element_ref;
pub mod optimizer;
mod transform;
mod tree_sink;
pub use arena::{ArenaDom, ArenaNode, ArenaNodeData, ArenaNodeId};
pub use css::{Declaration, Origin, PropertyValue, Specificity, Stylesheet};
pub use element_ref::{BokoSelectors, ElementRef};
pub use optimizer::optimize;
pub use transform::user_agent_stylesheet;
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use crate::ir::IRChapter;
use tree_sink::ArenaSink;
/// Compiles an HTML document into an optimized [`IRChapter`].
///
/// The input is fed to html5ever as UTF-8 bytes and collected into an arena
/// DOM. The stylesheet list handed to the transform starts with the built-in
/// user-agent stylesheet (tagged [`Origin::UserAgent`]) and is followed by
/// `author_stylesheets` in the order given, each keeping its own origin.
/// The transformed chapter is run through [`optimizer::optimize`] before it
/// is returned.
pub fn compile_html(html: &str, author_stylesheets: &[(Stylesheet, Origin)]) -> IRChapter {
    let dom = parse_document(ArenaSink::new(), ParseOpts::default())
        .from_utf8()
        .one(html.as_bytes())
        .into_dom();

    // User-agent sheet goes first; caller-supplied sheets follow in caller order.
    let mut cascade = Vec::with_capacity(author_stylesheets.len() + 1);
    cascade.push((transform::user_agent_stylesheet(), Origin::UserAgent));
    cascade.extend(
        author_stylesheets
            .iter()
            .map(|(sheet, origin)| (sheet.clone(), *origin)),
    );

    let mut chapter = transform::transform(&dom, &cascade);
    optimizer::optimize(&mut chapter);
    chapter
}
/// Compiles an HTML document supplied as raw bytes.
///
/// An encoding hint is extracted from the bytes with
/// `crate::util::extract_xml_encoding`, the bytes are decoded to text with
/// `crate::util::decode_text` using that hint, and the decoded text is passed
/// to [`compile_html`] together with `author_stylesheets`.
pub fn compile_html_bytes(html: &[u8], author_stylesheets: &[(Stylesheet, Origin)]) -> IRChapter {
    let decoded = crate::util::decode_text(html, crate::util::extract_xml_encoding(html));
    compile_html(&decoded, author_stylesheets)
}
/// Collects the stylesheets referenced by or embedded in an HTML document.
///
/// Returns `(linked, inline)`:
///
/// * `linked` — the `href` value of every `<link>` element whose `rel`
///   attribute names a stylesheet. The values are returned exactly as
///   written in the document; they are not resolved against any base path.
/// * `inline` — the concatenated text of each `<style>` element. Elements
///   whose text is empty or whitespace-only are skipped.
///
/// Both lists follow the order in which a depth-first walk of the parsed
/// document reaches the elements.
///
/// `rel` is treated as a whitespace-separated list of ASCII case-insensitive
/// tokens, so `rel="StyleSheet"` and `rel="stylesheet preload"` are both
/// recognized. Links that also carry the `alternate` token are not reported.
pub fn extract_stylesheets(html: &str) -> (Vec<String>, Vec<String>) {
    // True when the `rel` token list contains `stylesheet` and does not
    // contain `alternate` (alternate stylesheets are not applied by default).
    fn rel_is_stylesheet(rel: &str) -> bool {
        let mut names_stylesheet = false;
        for token in rel.split_ascii_whitespace() {
            if token.eq_ignore_ascii_case("alternate") {
                return false;
            }
            names_stylesheet |= token.eq_ignore_ascii_case("stylesheet");
        }
        names_stylesheet
    }

    let dom = parse_document(ArenaSink::new(), ParseOpts::default())
        .from_utf8()
        .one(html.as_bytes())
        .into_dom();

    let mut linked = Vec::new();
    let mut inline = Vec::new();

    // Explicit stack instead of recursion; children are pushed in reverse so
    // that they are popped (and therefore visited) in their original order.
    let mut stack = vec![dom.document()];
    while let Some(id) = stack.pop() {
        if let Some(node) = dom.get(id)
            && let ArenaNodeData::Element { name, attrs, .. } = &node.data
        {
            match name.local.as_ref() {
                "link" => {
                    let is_stylesheet = attrs
                        .iter()
                        .any(|a| a.name.local.as_ref() == "rel" && rel_is_stylesheet(&a.value));
                    if is_stylesheet
                        && let Some(href) = attrs.iter().find(|a| a.name.local.as_ref() == "href")
                    {
                        linked.push(href.value.clone());
                    }
                }
                "style" => {
                    // Join the text of all direct children of the element.
                    let mut css = String::new();
                    for child in dom.children(id) {
                        if let Some(chunk) = dom.text_content(child) {
                            css.push_str(chunk);
                        }
                    }
                    if !css.trim().is_empty() {
                        inline.push(css);
                    }
                }
                _ => {}
            }
        }

        let children: Vec<_> = dom.children(id).collect();
        stack.extend(children.into_iter().rev());
    }

    (linked, inline)
}
/// Resolves the reference `rel` against the directory containing `base`.
///
/// The result is a normalized, `/`-separated path with no leading slash and
/// no `.` or `..` segments.
///
/// * A rooted reference (one starting with `/`) ignores `base` and is
///   resolved from the root.
/// * A reference containing `://` or starting with `data:` is returned
///   unchanged.
/// * Otherwise the final component of `base` (the file name) is dropped and
///   the segments of `rel` are applied to what remains.
///
/// `..` segments are honoured in both `base` and `rel`; a `..` that would
/// climb above the root is ignored rather than producing an error.
pub fn resolve_path(base: &str, rel: &str) -> String {
    use std::path::{Component, Path};

    // Applies every component of `path` to `stack`: `..` removes the last
    // segment (a no-op on an empty stack), a normal name is appended, and
    // root / prefix / `.` components contribute nothing.
    fn apply<'a>(stack: &mut Vec<&'a str>, path: &'a Path) {
        for component in path.components() {
            match component {
                Component::ParentDir => {
                    stack.pop();
                }
                Component::Normal(name) => {
                    if let Some(name) = name.to_str() {
                        stack.push(name);
                    }
                }
                Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
            }
        }
    }

    let rel_path = Path::new(rel);
    let mut stack: Vec<&str> = Vec::new();

    if rel_path.has_root() {
        // Rooted reference: `base` is irrelevant, but the path is still
        // normalized so that `/a/../b` and `/b` resolve to the same entry.
        apply(&mut stack, rel_path);
        return stack.join("/");
    }
    if rel.contains("://") || rel.starts_with("data:") {
        return rel.to_string();
    }

    // `parent()` strips the file name; `None` means there is no directory part.
    if let Some(dir) = Path::new(base).parent() {
        apply(&mut stack, dir);
    }
    apply(&mut stack, rel_path);
    stack.join("/")
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::Role;

    /// A minimal document compiles to at least three nodes, one of them text.
    #[test]
    fn test_compile_simple_html() {
        let chapter = compile_html("<html><body><p>Test paragraph</p></body></html>", &[]);
        assert!(chapter.node_count() >= 3);

        let text_nodes = chapter
            .iter_dfs()
            .map(|id| chapter.node(id).unwrap())
            .filter(|node| node.role == Role::Text)
            .count();
        assert!(text_nodes > 0);
    }

    /// An author rule matching a class makes the paragraph's style bold.
    #[test]
    fn test_compile_with_css() {
        let author = Stylesheet::parse(".highlight { font-weight: bold; }");
        let chapter = compile_html(
            "<p class='highlight'>Styled</p>",
            &[(author, Origin::Author)],
        );

        let bold_paragraph_found = chapter.iter_dfs().any(|id| {
            let node = chapter.node(id).unwrap();
            node.role == Role::Paragraph
                && chapter.styles.get(node.style).unwrap().font_weight
                    == crate::ir::FontWeight::BOLD
        });
        assert!(bold_paragraph_found, "Styled paragraph not found");
    }

    /// Both `<link rel="stylesheet">` hrefs and the `<style>` body are reported.
    #[test]
    fn test_extract_stylesheets() {
        let (linked, inline) = extract_stylesheets(
            r#"
<html>
<head>
<link rel="stylesheet" href="styles.css">
<link rel="stylesheet" href="theme.css">
<style>p { color: red; }</style>
</head>
<body><p>Content</p></body>
</html>
"#,
        );

        assert_eq!(linked.len(), 2);
        assert!(linked.iter().any(|href| href == "styles.css"));
        assert!(linked.iter().any(|href| href == "theme.css"));
        assert_eq!(inline.len(), 1);
        assert!(inline[0].contains("color: red"));
    }

    /// The byte-oriented entry point produces more than a bare root node.
    #[test]
    fn test_compile_html_bytes() {
        let chapter = compile_html_bytes(b"<p>Bytes test</p>", &[]);
        assert!(chapter.node_count() > 1);
    }

    /// `..` climbs out of the base file's directory.
    #[test]
    fn test_resolve_path_parent_dir() {
        let resolved = resolve_path("OEBPS/text/ch1.html", "../images/logo.png");
        assert_eq!(resolved, "OEBPS/images/logo.png");
    }

    /// A plain relative path lands next to the base file.
    #[test]
    fn test_resolve_path_same_dir() {
        let resolved = resolve_path("OEBPS/content.html", "images/photo.jpg");
        assert_eq!(resolved, "OEBPS/images/photo.jpg");
    }

    /// A rooted path ignores the base and loses its leading slash.
    #[test]
    fn test_resolve_path_absolute() {
        let resolved = resolve_path("ch1.html", "/images/absolute.png");
        assert_eq!(resolved, "images/absolute.png");
    }

    /// Consecutive `..` segments each remove one directory.
    #[test]
    fn test_resolve_path_multiple_parent() {
        let resolved = resolve_path("a/b/c/file.html", "../../images/test.png");
        assert_eq!(resolved, "a/images/test.png");
    }

    /// A leading `./` is dropped.
    #[test]
    fn test_resolve_path_current_dir() {
        let resolved = resolve_path("OEBPS/ch1.html", "./images/test.png");
        assert_eq!(resolved, "OEBPS/images/test.png");
    }

    /// Text on both sides of an inline element is still present after optimization.
    #[test]
    fn test_optimizer_merges_sibling_text_nodes() {
        let chapter = compile_html(
            r#"
<html><body>
<p>Hello, <b>World</b>!</p>
</body></html>
"#,
            &[],
        );

        // Concatenate the text of every non-empty text node in DFS order.
        let mut text_content = String::new();
        for node in chapter.iter_dfs().map(|id| chapter.node(id).unwrap()) {
            if node.role != Role::Text || node.text.is_empty() {
                continue;
            }
            text_content.push_str(chapter.text(node.text));
        }

        assert!(
            text_content.contains("Hello"),
            "Missing 'Hello' in: {}",
            text_content
        );
        assert!(
            text_content.contains("World"),
            "Missing 'World' in: {}",
            text_content
        );
    }

    /// Text of separate paragraphs is still present after optimization.
    #[test]
    fn test_optimizer_preserves_tree_structure() {
        let chapter = compile_html(
            r#"
<html><body>
<p>First paragraph</p>
<p>Second paragraph</p>
</body></html>
"#,
            &[],
        );

        // Concatenate the text of every non-empty text node in DFS order.
        let mut text_content = String::new();
        for node in chapter.iter_dfs().map(|id| chapter.node(id).unwrap()) {
            if node.role != Role::Text || node.text.is_empty() {
                continue;
            }
            text_content.push_str(chapter.text(node.text));
        }

        assert!(
            text_content.contains("First paragraph"),
            "Missing 'First paragraph' in: {}",
            text_content
        );
        assert!(
            text_content.contains("Second paragraph"),
            "Missing 'Second paragraph' in: {}",
            text_content
        );
    }

    /// URLs with a scheme and `data:` URIs are returned untouched.
    #[test]
    fn test_resolve_path_url_passthrough() {
        let remote = resolve_path("ch1.html", "https://example.com/image.png");
        assert_eq!(remote, "https://example.com/image.png");

        let embedded = resolve_path("ch1.html", "data:image/png;base64,abc");
        assert_eq!(embedded, "data:image/png;base64,abc");
    }

    /// A `<br/>` between inline spans must still be a Break node afterwards.
    #[test]
    fn test_br_survives_optimizer() {
        let chapter = compile_html(
            r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<blockquote>
<p>
<span>Line 1</span>
<br/>
<span>Line 2</span>
</p>
</blockquote>
</body></html>"#,
            &[],
        );

        let found_break = chapter
            .iter_dfs()
            .any(|id| chapter.node(id).unwrap().role == Role::Break);
        assert!(found_break, "Break node lost during optimization");
    }
}