use super::{InputParser, InputParserConfig};
use crate::elements::{ContentElement, FontSpec, TextContent, TextStyle};
use crate::error::Result;
use crate::geometry::Rect;
use crate::layout::FontWeight;
/// Input parser that turns HTML markup into positioned content elements.
///
/// The type is a stateless unit struct; all per-document state lives inside
/// the `parse` call.
#[derive(Default, Debug)]
pub struct HtmlParser;

impl HtmlParser {
    /// Creates a new parser. Equivalent to `HtmlParser::default()`.
    pub fn new() -> Self {
        HtmlParser
    }
}
impl InputParser for HtmlParser {
    /// Converts an HTML fragment into a flat list of content elements laid out
    /// top to bottom.
    ///
    /// Comments are stripped first, then the input is scanned one character at
    /// a time:
    ///
    /// * Text accumulated between tags is flushed as one text element whenever
    ///   a `<` is met (and once more at end of input). Each flushed element
    ///   moves the cursor down by 1.2 times its bounding-box height.
    /// * Recognised tags (`h1`-`h6`, `p`, `br`, `hr`, `b`/`strong`, `i`/`em`,
    ///   `code`/`pre`, `li`, `ul`/`ol`) update the running style and/or the
    ///   vertical cursor; every other tag is ignored.
    /// * Character references are decoded with `decode_html_entity`; raw line
    ///   breaks collapse into a single space.
    ///
    /// The vertical cursor starts at `config.content_start_y()` and decreases
    /// as content is emitted. This implementation always returns `Ok`.
    fn parse(&self, input: &str, config: &InputParserConfig) -> Result<Vec<ContentElement>> {
        // Longest run accepted between '&' and ';'. Bounding the scan keeps a
        // bare ampersand ("Tom & Jerry") from swallowing following markup up
        // to some unrelated semicolon later in the document.
        const MAX_ENTITY_BODY: usize = 32;

        // Returns the offset of the terminating ';' when `chars` (which starts
        // at '&') looks like a character reference: 1..=MAX_ENTITY_BODY ASCII
        // alphanumerics or '#', followed by ';'.
        fn entity_end(chars: &[char]) -> Option<usize> {
            for (offset, &c) in chars.iter().enumerate().skip(1).take(MAX_ENTITY_BODY + 1) {
                if c == ';' {
                    return if offset > 1 { Some(offset) } else { None };
                }
                if !(c.is_ascii_alphanumeric() || c == '#') {
                    return None;
                }
            }
            None
        }

        let mut elements = Vec::new();
        let mut y_position = config.content_start_y();
        let x_position = config.margin_left;
        let mut reading_order = 0;

        let cleaned = strip_html_comments(input);
        let chars: Vec<char> = cleaned.chars().collect();

        let mut current_text = String::new();
        let mut tag_name = String::new();
        let mut in_tag = false;
        let mut current_style = TextStyleState::default();

        let mut i = 0;
        while i < chars.len() {
            let ch = chars[i];
            if ch == '<' {
                // A tag starts: flush whatever visible text was collected.
                if !current_text.trim().is_empty() {
                    let element = create_text_element(
                        &current_text,
                        x_position,
                        y_position,
                        config,
                        &current_style,
                        reading_order,
                    );
                    y_position -= element.bbox().height * 1.2;
                    elements.push(element);
                    reading_order += 1;
                }
                current_text.clear();
                in_tag = true;
                tag_name.clear();
            } else if ch == '>' && in_tag {
                in_tag = false;
                let tag = tag_name.trim().to_lowercase();
                let (tag_type, closing) = parse_tag(&tag);
                match tag_type.as_str() {
                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                        if !closing {
                            // The arm guarantees the second byte is '1'..='6'.
                            let level: u8 = tag_type.as_bytes()[1] - b'0';
                            current_style.heading_level = Some(level);
                            current_style.bold = level <= 2;
                        } else {
                            current_style.heading_level = None;
                            current_style.bold = false;
                            y_position -= config.default_font_size * 0.5;
                        }
                    },
                    "p" => {
                        if closing {
                            y_position -= config.default_font_size * 0.5;
                        }
                    },
                    "br" => {
                        y_position -= config.default_font_size * config.line_height;
                    },
                    "hr" => {
                        // The rule spans the area between the horizontal margins.
                        let hr_element = create_horizontal_rule(
                            x_position,
                            y_position,
                            config.page_width - config.margin_left - config.margin_right,
                            reading_order,
                        );
                        elements.push(hr_element);
                        reading_order += 1;
                        y_position -= config.default_font_size;
                    },
                    "b" | "strong" => {
                        current_style.bold = !closing;
                    },
                    "i" | "em" => {
                        current_style.italic = !closing;
                    },
                    "code" | "pre" => {
                        current_style.monospace = !closing;
                    },
                    "li" => {
                        if !closing {
                            current_text.push_str("• ");
                        } else {
                            y_position -= config.default_font_size * 0.3;
                        }
                    },
                    "ul" | "ol" => {
                        if closing {
                            y_position -= config.default_font_size * 0.3;
                        }
                    },
                    _ => {},
                }
                tag_name.clear();
            } else if in_tag {
                tag_name.push(ch);
            } else if ch == '&' {
                match entity_end(&chars[i..]) {
                    Some(end) => {
                        let entity: String = chars[i..=i + end].iter().collect();
                        current_text.push_str(&decode_html_entity(&entity));
                        // Land on the ';'; the shared increment below steps past it.
                        i += end;
                    },
                    // Not a character reference: keep the ampersand literally.
                    None => current_text.push(ch),
                }
            } else if ch == '\n' || ch == '\r' {
                // Line breaks in the markup act as a single separating space.
                if !current_text.ends_with(' ') && !current_text.is_empty() {
                    current_text.push(' ');
                }
            } else {
                current_text.push(ch);
            }
            i += 1;
        }

        // Flush text that follows the last tag.
        if !current_text.trim().is_empty() {
            let element = create_text_element(
                &current_text,
                x_position,
                y_position,
                config,
                &current_style,
                reading_order,
            );
            elements.push(element);
        }
        Ok(elements)
    }

    /// Short identifier of this parser.
    fn name(&self) -> &'static str {
        "html"
    }

    /// MIME type handled by this parser.
    fn mime_type(&self) -> &'static str {
        "text/html"
    }

    /// File extensions (without the dot) associated with this parser.
    fn extensions(&self) -> &[&'static str] {
        &["html", "htm"]
    }
}
/// Running inline style tracked while the HTML is scanned; every field starts
/// out "off" via `Default`.
#[derive(Clone, Debug, Default)]
struct TextStyleState {
    /// Set by `<b>`/`<strong>` and by `<h1>`/`<h2>` openings.
    bold: bool,
    /// Set by `<i>`/`<em>`.
    italic: bool,
    /// Set by `<code>`/`<pre>`.
    monospace: bool,
    /// Heading level (1-6) while inside an `<hN>` element, otherwise `None`.
    heading_level: Option<u8>,
}
/// Splits the text found between `<` and `>` into `(element name, is_closing)`.
///
/// * A leading `/` marks a closing tag and is removed from the name.
/// * Attributes (anything after the first whitespace) are ignored.
/// * A trailing `/` from the self-closing form (`br/`, `hr /`) is dropped, so
///   `<br/>` yields the same name as `<br>`.
///
/// The name is returned as written; callers lowercase beforehand if needed.
/// Empty input yields an empty name.
fn parse_tag(tag: &str) -> (String, bool) {
    let tag = tag.trim();
    let (body, closing) = match tag.strip_prefix('/') {
        Some(rest) => (rest, true),
        None => (tag, false),
    };
    let name = body
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches('/');
    (name.to_string(), closing)
}
/// Returns `input` with every `<!-- ... -->` comment removed.
///
/// Text outside comments is copied unchanged, including the whitespace on
/// either side of a comment. The search for the closing `-->` begins right
/// after the opening `<!--`. An unterminated comment discards everything from
/// its opening marker to the end of the input.
fn strip_html_comments(input: &str) -> String {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    let mut result = String::with_capacity(input.len());
    let mut rest = input;
    while let Some(start) = rest.find(OPEN) {
        result.push_str(&rest[..start]);
        let after_open = &rest[start + OPEN.len()..];
        match after_open.find(CLOSE) {
            Some(end) => rest = &after_open[end + CLOSE.len()..],
            // No terminator: the remainder belongs to the comment.
            None => return result,
        }
    }
    result.push_str(rest);
    result
}
/// Decodes a single HTML character reference such as `&amp;`, `&#65;` or
/// `&#x41;` into the text it stands for.
///
/// `entity` is expected to include the leading `&` and the trailing `;`.
/// A small table of named references is supported, plus decimal (`&#N;`) and
/// hexadecimal (`&#xN;` / `&#XN;`) numeric references. Anything that is not
/// recognised, or that names an invalid code point, is returned unchanged.
fn decode_html_entity(entity: &str) -> String {
    let named = match entity {
        "&amp;" => Some("&"),
        "&lt;" => Some("<"),
        "&gt;" => Some(">"),
        "&quot;" => Some("\""),
        "&apos;" => Some("'"),
        "&nbsp;" => Some(" "),
        "&mdash;" | "&emdash;" => Some("—"),
        "&ndash;" | "&endash;" => Some("–"),
        "&copy;" => Some("©"),
        "&reg;" => Some("®"),
        "&trade;" => Some("™"),
        "&hellip;" => Some("…"),
        _ => None,
    };
    if let Some(text) = named {
        return text.to_string();
    }

    // Numeric reference: "&#" digits ";" or "&#x" hex-digits ";".
    let code_point = entity
        .strip_prefix("&#")
        .and_then(|rest| rest.strip_suffix(';'))
        .and_then(|body| {
            let (digits, radix) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (body, 10),
            };
            // The integer parsers accept a leading '+', which is not valid in
            // a character reference, so insist on digits only.
            if digits.chars().all(|c| c.is_digit(radix)) {
                u32::from_str_radix(digits, radix).ok()
            } else {
                None
            }
        });

    match code_point.and_then(char::from_u32) {
        Some(ch) => ch.to_string(),
        None => entity.to_string(),
    }
}
/// Builds a text element for `text` at `(x, y)` using the current inline style.
///
/// * Whitespace in `text` is collapsed with `normalize_whitespace`.
/// * Font size comes from the heading level (24/20/16/14 for levels 1-4, 12
///   for deeper levels) or from `config.default_font_size` outside headings.
/// * The font name is picked from the Courier family when monospace is active
///   and from the Helvetica family otherwise, including the bold / oblique
///   variants. Monospace italic text maps to `Courier-Oblique` or
///   `Courier-BoldOblique` so the face agrees with the italic style flag.
/// * The bounding box uses a fixed width of 400 and a height equal to the
///   font size.
///   NOTE(review): `Rect::new` is presumed to take (x, y, width, height), as
///   the horizontal-rule helper in this file suggests; confirm.
fn create_text_element(
    text: &str,
    x: f32,
    y: f32,
    config: &InputParserConfig,
    style_state: &TextStyleState,
    reading_order: usize,
) -> ContentElement {
    // Fixed layout width given to every text box.
    const TEXT_BOX_WIDTH: f32 = 400.0;

    let text = normalize_whitespace(text);

    let font_size = match style_state.heading_level {
        Some(1) => 24.0,
        Some(2) => 20.0,
        Some(3) => 16.0,
        Some(4) => 14.0,
        Some(_) => 12.0,
        None => config.default_font_size,
    };

    let font_name = match (style_state.monospace, style_state.bold, style_state.italic) {
        (true, true, true) => "Courier-BoldOblique",
        (true, true, false) => "Courier-Bold",
        (true, false, true) => "Courier-Oblique",
        (true, false, false) => "Courier",
        (false, true, true) => "Helvetica-BoldOblique",
        (false, true, false) => "Helvetica-Bold",
        (false, false, true) => "Helvetica-Oblique",
        (false, false, false) => "Helvetica",
    };

    let weight = if style_state.bold {
        FontWeight::Bold
    } else {
        FontWeight::Normal
    };

    ContentElement::Text(TextContent {
        artifact_type: None,
        text,
        bbox: Rect::new(x, y, TEXT_BOX_WIDTH, font_size),
        font: FontSpec {
            name: font_name.to_string(),
            size: font_size,
        },
        style: TextStyle {
            weight,
            italic: style_state.italic,
            ..Default::default()
        },
        reading_order: Some(reading_order),
    })
}
/// Builds the path element used for `<hr>`: a light-gray, 1-unit-wide
/// horizontal stroke from `(x, y)` to `(x + width, y)`.
fn create_horizontal_rule(x: f32, y: f32, width: f32, reading_order: usize) -> ContentElement {
    use crate::elements::{LineCap, LineJoin, PathContent, PathOperation};
    use crate::layout::Color;

    // Single straight segment, left to right.
    let segment = vec![
        PathOperation::MoveTo(x, y),
        PathOperation::LineTo(x + width, y),
    ];
    let light_gray = Color {
        r: 0.7,
        g: 0.7,
        b: 0.7,
    };

    let rule = PathContent {
        operations: segment,
        bbox: Rect::new(x, y, width, 1.0),
        stroke_color: Some(light_gray),
        fill_color: None,
        stroke_width: 1.0,
        line_cap: LineCap::Butt,
        line_join: LineJoin::Miter,
        reading_order: Some(reading_order),
    };
    ContentElement::Path(rule)
}
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Unit tests for the HTML parser and its private helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the text of every text element, in output order.
    fn text_of(elements: &[ContentElement]) -> Vec<&String> {
        elements
            .iter()
            .filter_map(|e| {
                if let ContentElement::Text(t) = e {
                    Some(&t.text)
                } else {
                    None
                }
            })
            .collect()
    }

    #[test]
    fn test_html_parser_creation() {
        let parser = HtmlParser::new();
        assert_eq!(parser.name(), "html");
        assert_eq!(parser.mime_type(), "text/html");
    }

    #[test]
    fn test_parse_simple_html() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        let html = "<p>Hello, World!</p>";
        let elements = parser.parse(html, &config).unwrap();
        assert!(!elements.is_empty());
        if let ContentElement::Text(text) = &elements[0] {
            assert_eq!(text.text, "Hello, World!");
        } else {
            panic!("Expected text element");
        }
    }

    #[test]
    fn test_parse_headings() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        let html = "<h1>Title</h1><p>Content</p>";
        let elements = parser.parse(html, &config).unwrap();
        assert!(elements.len() >= 2);
        if let ContentElement::Text(text) = &elements[0] {
            assert_eq!(text.text, "Title");
            assert_eq!(text.font.size, 24.0);
        } else {
            panic!("Expected text element for heading");
        }
    }

    #[test]
    fn test_parse_bold_italic() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        let html = "<b>Bold</b> <i>Italic</i>";
        let elements = parser.parse(html, &config).unwrap();
        assert!(!elements.is_empty());
    }

    #[test]
    fn test_html_entities() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        // The markup must carry the escaped forms; the parser decodes them.
        let html = "<p>&amp; &lt; &gt; &quot;</p>";
        let elements = parser.parse(html, &config).unwrap();
        if let ContentElement::Text(text) = &elements[0] {
            assert!(text.text.contains('&'));
            assert!(text.text.contains('<'));
            assert!(text.text.contains('>'));
            assert!(text.text.contains('"'));
        } else {
            panic!("Expected text element");
        }
    }

    #[test]
    fn test_strip_comments() {
        let input = "Hello <!-- this is a comment --> World";
        let result = strip_html_comments(input);
        // Only the comment is removed; the space on each side of it remains.
        assert_eq!(result, "Hello  World");
    }

    #[test]
    fn test_horizontal_rule() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        let html = "<p>Before</p><hr><p>After</p>";
        let elements = parser.parse(html, &config).unwrap();
        let has_path = elements
            .iter()
            .any(|e| matches!(e, ContentElement::Path(_)));
        assert!(has_path);
    }

    #[test]
    fn test_list_items() {
        let parser = HtmlParser::new();
        let config = InputParserConfig::default();
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let elements = parser.parse(html, &config).unwrap();
        let texts = text_of(&elements);
        assert!(texts.iter().any(|t| t.contains('•')));
    }

    #[test]
    fn test_numeric_entity() {
        let decoded = decode_html_entity("&#65;");
        assert_eq!(decoded, "A");
        let decoded_hex = decode_html_entity("&#x41;");
        assert_eq!(decoded_hex, "A");
    }

    #[test]
    fn test_parse_tag() {
        let (name, closing) = parse_tag("p");
        assert_eq!(name, "p");
        assert!(!closing);
        let (name, closing) = parse_tag("/p");
        assert_eq!(name, "p");
        assert!(closing);
        let (name, closing) = parse_tag("div class=\"test\"");
        assert_eq!(name, "div");
        assert!(!closing);
    }
}