use super::tags::{tag_intent, TagIntent};
use crate::html::stack::StyleStack;
use docspec_core::Event;
use html5gum::{Token, Tokenizer};
/// Tokenizes an HTML fragment with `html5gum`.
///
/// The returned iterator is the `Tokenizer` itself and borrows `input` for as long as it
/// lives (the `'_` bound). Its error type is `Infallible`, so callers can discharge the
/// `Err` arm with an empty `match`.
pub(crate) fn tokenize_fragment(
input: &str,
) -> impl Iterator<Item = Result<Token, core::convert::Infallible>> + '_ {
Tokenizer::new(input)
}
/// Classifies a start-tag name, filtering out tags the translator does not act on.
///
/// Returns `None` when [`tag_intent`] yields [`TagIntent::Ignored`]; every other intent is
/// passed through as `Some(intent)`.
pub(crate) fn classify_start_tag(name: &[u8]) -> Option<TagIntent> {
    let intent = tag_intent(name);
    if matches!(intent, TagIntent::Ignored) {
        None
    } else {
        Some(intent)
    }
}
/// State for the heading currently being assembled while block-level HTML is translated.
///
/// The derived `Default` is the "no heading open" state: no level, zero nesting depth and
/// no text recorded.
#[derive(Default)]
pub(crate) struct BlockHeadingAccumulator {
/// Level of the heading that is currently open; `None` when no heading is open.
open_level: Option<u8>,
/// Nesting counter adjusted by `enter_nested_ignored_heading` /
/// `exit_nested_ignored_heading`; non-zero means "inside a nested heading that is ignored".
nested_ignored_heading_depth: usize,
/// Set by `note_heading_text`; cleared again whenever a heading is opened or closed.
heading_text_emitted: bool,
}
impl BlockHeadingAccumulator {
    /// Opens a heading of the given `level`.
    ///
    /// Returns the matching `StartHeading` event. If a heading is already open the call is
    /// a no-op: state is left untouched and `None` is returned.
    pub(crate) fn open(&mut self, level: u8) -> Option<Event> {
        if self.is_open() {
            return None;
        }
        *self = Self {
            open_level: Some(level),
            nested_ignored_heading_depth: 0,
            heading_text_emitted: false,
        };
        Some(Event::StartHeading { id: None, level })
    }

    /// Closes the open heading, if any, and resets all bookkeeping.
    ///
    /// Returns `EndHeading` when a heading was open, `None` otherwise.
    pub(crate) fn close(&mut self) -> Option<Event> {
        // The derived `Default` is exactly the reset state (no level, depth 0, no text),
        // so swapping it in both clears the accumulator and hands back the old level.
        let previous = core::mem::take(self);
        previous.open_level.map(|_| Event::EndHeading)
    }

    /// End-of-block hook; behaves exactly like [`Self::close`].
    pub(crate) fn finish_block(&mut self) -> Option<Event> {
        self.close()
    }

    /// Reports whether a heading is currently open.
    pub(crate) fn is_open(&self) -> bool {
        self.open_level.is_some()
    }

    /// Records one more level of nested (ignored) heading; saturates instead of overflowing.
    fn enter_nested_ignored_heading(&mut self) {
        let depth = &mut self.nested_ignored_heading_depth;
        *depth = depth.saturating_add(1);
    }

    /// Leaves one level of nested (ignored) heading; saturates at zero.
    fn exit_nested_ignored_heading(&mut self) {
        let depth = &mut self.nested_ignored_heading_depth;
        *depth = depth.saturating_sub(1);
    }

    /// Reports whether the nesting counter is currently non-zero.
    fn is_inside_nested_ignored_heading(&self) -> bool {
        self.nested_ignored_heading_depth != 0
    }

    /// Reports whether text has been recorded since the heading was opened.
    fn has_heading_text(&self) -> bool {
        self.heading_text_emitted
    }

    /// Marks that text has been recorded for the open heading.
    fn note_heading_text(&mut self) {
        self.heading_text_emitted = true;
    }
}
/// Context in which an HTML fragment is being translated; consulted by `translate_void`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum HtmlContext {
/// Inline context: a thematic break produces no event here.
Inline,
/// Block context: a thematic break produces `Event::ThematicBreak`.
Block,
}
/// Translates a void-element intent into its event, if it has one in `context`.
///
/// * `LineBreak` yields `Event::LineBreak` in every context.
/// * `ThematicBreak` yields `Event::ThematicBreak` in block context only.
/// * Every other combination yields `None`.
pub(crate) fn translate_void(intent: &TagIntent, context: HtmlContext) -> Option<Event> {
    match (intent, context) {
        (TagIntent::LineBreak, _) => Some(Event::LineBreak),
        (TagIntent::ThematicBreak, HtmlContext::Block) => {
            Some(Event::ThematicBreak { id: None })
        }
        _ => None,
    }
}
/// Translates an inline HTML fragment into document events.
///
/// * Inline style tags (bold, italic, underline, strikethrough, code, subscript,
///   superscript, mark) open or close an entry on `stack`. When `in_preformatted` is true
///   they are skipped and `stack` is left untouched.
/// * `<br>` yields `Event::LineBreak`; `<hr>` yields nothing in inline context.
/// * Heading tags, unrecognised tags, comments, doctypes and tokenizer errors are dropped.
/// * Character data becomes `Event::Text`. Any events returned by `stack.note_text()` are
///   emitted immediately before it, so a fragment that carries both a style tag and its
///   text (`<b>text</b>`) produces the style start ahead of the text, matching what
///   `translate_block` does for heading text.
///
/// Returns the events in document order.
pub(crate) fn translate_inline(
    fragment: &str,
    stack: &mut StyleStack,
    in_preformatted: bool,
) -> Vec<Event> {
    let mut out = Vec::new();
    for token_result in tokenize_fragment(fragment) {
        let token = match token_result {
            Ok(token) => token,
            // The tokenizer's error type is uninhabited; this arm can never run.
            Err(error) => match error {},
        };
        match token {
            Token::StartTag(tag) => match tag_intent(&tag.name) {
                style @ (TagIntent::Bold
                | TagIntent::Italic
                | TagIntent::Underline
                | TagIntent::Strikethrough
                | TagIntent::Code
                | TagIntent::Subscript
                | TagIntent::Superscript
                | TagIntent::Mark) => {
                    if !in_preformatted {
                        out.extend(stack.open(style));
                    }
                }
                void @ (TagIntent::LineBreak | TagIntent::ThematicBreak) => {
                    out.extend(translate_void(&void, HtmlContext::Inline));
                }
                TagIntent::Heading(_) | TagIntent::Ignored => {}
            },
            Token::EndTag(tag) => match tag_intent(&tag.name) {
                style @ (TagIntent::Bold
                | TagIntent::Italic
                | TagIntent::Underline
                | TagIntent::Strikethrough
                | TagIntent::Code
                | TagIntent::Subscript
                | TagIntent::Superscript
                | TagIntent::Mark) => {
                    if !in_preformatted {
                        out.extend(stack.close(style));
                    }
                }
                TagIntent::LineBreak
                | TagIntent::ThematicBreak
                | TagIntent::Heading(_)
                | TagIntent::Ignored => {}
            },
            Token::String(text) => {
                let content = String::from_utf8_lossy(&text.0).into_owned();
                // Flush style starts that were deferred until text arrives; without this
                // a style opened earlier in the same fragment is never started.
                out.extend(stack.note_text());
                out.push(Event::Text { content });
            }
            Token::Comment(_) | Token::Doctype(_) | Token::Error(_) => {}
        }
    }
    out
}
/// Translates a block-level HTML fragment into document events.
///
/// State lives in `heading_acc` and `inline_stack`, so a heading may span several calls
/// (for example one call per source line).
///
/// * A heading start tag opens a heading. If one is already open, the new tag only bumps
///   the nested-heading counter, and everything up to its matching end tag is dropped.
/// * A heading end tag either leaves one level of nested heading or, at the outer level,
///   closes all inline styles and then the heading itself.
/// * Inline style tags are honoured only inside an open heading, outside any nested ignored
///   heading, and when `in_preformatted` is false.
/// * `<br>` / `<hr>` are translated for block context, except inside a nested ignored
///   heading, where they are dropped along with the rest of that heading's content.
/// * Character data is emitted only inside an open heading (and outside nested ignored
///   headings). Whitespace-only text before the heading's first real text is skipped.
/// * Comments, doctypes, tokenizer errors and unrecognised tags are dropped.
///
/// Returns the events in document order.
pub(crate) fn translate_block(
    fragment: &str,
    heading_acc: &mut BlockHeadingAccumulator,
    inline_stack: &mut StyleStack,
    in_preformatted: bool,
) -> Vec<Event> {
    let mut out = Vec::new();
    for token_result in tokenize_fragment(fragment) {
        let token = match token_result {
            Ok(token) => token,
            // The tokenizer's error type is uninhabited; this arm can never run.
            Err(error) => match error {},
        };
        match token {
            Token::StartTag(tag) => match tag_intent(&tag.name) {
                TagIntent::Heading(level) => {
                    if heading_acc.is_open() {
                        // Nested headings are not supported: remember the depth so the
                        // matching end tag does not close the outer heading.
                        heading_acc.enter_nested_ignored_heading();
                    } else {
                        out.extend(heading_acc.open(level));
                    }
                }
                style @ (TagIntent::Bold
                | TagIntent::Italic
                | TagIntent::Underline
                | TagIntent::Strikethrough
                | TagIntent::Code
                | TagIntent::Subscript
                | TagIntent::Superscript
                | TagIntent::Mark) => {
                    if heading_acc.is_open()
                        && !heading_acc.is_inside_nested_ignored_heading()
                        && !in_preformatted
                    {
                        out.extend(inline_stack.open(style));
                    }
                }
                void @ (TagIntent::LineBreak | TagIntent::ThematicBreak) => {
                    // Content of a nested ignored heading is dropped wholesale, so its
                    // void elements must not leak into the outer heading either.
                    if !heading_acc.is_inside_nested_ignored_heading() {
                        out.extend(translate_void(&void, HtmlContext::Block));
                    }
                }
                TagIntent::Ignored => {}
            },
            Token::EndTag(tag) => match tag_intent(&tag.name) {
                TagIntent::Heading(_) => {
                    if heading_acc.is_inside_nested_ignored_heading() {
                        heading_acc.exit_nested_ignored_heading();
                    } else {
                        // Styles left open inside the heading are closed before it ends.
                        out.extend(inline_stack.close_all());
                        out.extend(heading_acc.close());
                    }
                }
                style @ (TagIntent::Bold
                | TagIntent::Italic
                | TagIntent::Underline
                | TagIntent::Strikethrough
                | TagIntent::Code
                | TagIntent::Subscript
                | TagIntent::Superscript
                | TagIntent::Mark) => {
                    if heading_acc.is_open()
                        && !heading_acc.is_inside_nested_ignored_heading()
                        && !in_preformatted
                    {
                        out.extend(inline_stack.close(style));
                    }
                }
                TagIntent::LineBreak | TagIntent::ThematicBreak | TagIntent::Ignored => {}
            },
            Token::String(text) => {
                if !heading_acc.is_open() || heading_acc.is_inside_nested_ignored_heading() {
                    continue;
                }
                let content = String::from_utf8_lossy(&text.0).into_owned();
                // Skip whitespace-only text until the heading has real text, e.g. the
                // newline right after a start tag that sits on its own line.
                if !heading_acc.has_heading_text() && content.trim().is_empty() {
                    continue;
                }
                heading_acc.note_heading_text();
                out.extend(inline_stack.note_text());
                out.push(Event::Text { content });
            }
            Token::Comment(_) | Token::Doctype(_) | Token::Error(_) => {}
        }
    }
    out
}
#[cfg(test)]
mod tests {
    #![allow(
        clippy::doc_markdown,
        clippy::indexing_slicing,
        clippy::panic,
        clippy::single_match_else
    )]
    use super::*;
    use crate::html::MARK_COLOR;
    use docspec_core::TextStyleKind;

    /// Builds the `StartTextStyle` event expected for `kind`.
    fn start(kind: TextStyleKind) -> Event {
        Event::StartTextStyle { kind, id: None }
    }

    // ---- tokenize_fragment -------------------------------------------------------------

    #[test]
    fn tokenize_empty() {
        let tokens: Vec<_> = tokenize_fragment("")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 0, "empty input should yield no tokens");
    }

    #[test]
    fn tokenize_start_tag() {
        let tokens: Vec<_> = tokenize_fragment("<b>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 1, "should have exactly one token");
        match &tokens[0] {
            Token::StartTag(tag) => {
                assert_eq!(tag.name, b"b", "tag name should be lowercase 'b'");
            }
            other => panic!("expected StartTag, got {other:?}"),
        }
    }

    #[test]
    fn tokenize_end_tag() {
        let tokens: Vec<_> = tokenize_fragment("</b>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 1, "should have exactly one token");
        match &tokens[0] {
            Token::EndTag(tag) => {
                assert_eq!(tag.name, b"b", "tag name should be lowercase 'b'");
            }
            other => panic!("expected EndTag, got {other:?}"),
        }
    }

    #[test]
    fn tokenize_self_closing() {
        let tokens: Vec<_> = tokenize_fragment("<br/>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 1, "should have exactly one token");
        match &tokens[0] {
            Token::StartTag(tag) => {
                assert_eq!(tag.name, b"br", "tag name should be 'br'");
                assert!(tag.self_closing, "br should be marked self-closing");
            }
            other => panic!("expected StartTag with self_closing=true, got {other:?}"),
        }
    }

    #[test]
    fn tokenize_text_inside_tag() {
        let tokens: Vec<_> = tokenize_fragment("<b>hello</b>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 3, "should have StartTag, String, EndTag");
        match &tokens[0] {
            Token::StartTag(tag) => assert_eq!(tag.name, b"b"),
            other => panic!("expected StartTag, got {other:?}"),
        }
        match &tokens[1] {
            Token::String(s) => {
                let text = String::from_utf8_lossy(&s.0);
                assert_eq!(text, "hello", "text content should be 'hello'");
            }
            other => panic!("expected String, got {other:?}"),
        }
        match &tokens[2] {
            Token::EndTag(tag) => assert_eq!(tag.name, b"b"),
            other => panic!("expected EndTag, got {other:?}"),
        }
    }

    #[test]
    fn tokenize_uppercase_normalized_to_lowercase() {
        let tokens: Vec<_> = tokenize_fragment("<B>X</B>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 3, "should have StartTag, String, EndTag");
        match &tokens[0] {
            Token::StartTag(tag) => {
                assert_eq!(
                    tag.name, b"b",
                    "uppercase B should be normalized to lowercase b"
                );
            }
            other => panic!("expected StartTag, got {other:?}"),
        }
        match &tokens[2] {
            Token::EndTag(tag) => {
                assert_eq!(
                    tag.name, b"b",
                    "uppercase B should be normalized to lowercase b"
                );
            }
            other => panic!("expected EndTag, got {other:?}"),
        }
    }

    /// The input must contain an actual character reference (`&amp;`); a bare `&` would
    /// pass through unchanged and never exercise entity decoding.
    #[test]
    fn tokenize_handles_entities() {
        let tokens: Vec<_> = tokenize_fragment("<b>&amp;</b>")
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(tokens.len(), 3, "should have StartTag, String, EndTag");
        match &tokens[1] {
            Token::String(s) => {
                let text = String::from_utf8_lossy(&s.0);
                assert_eq!(text, "&", "entity &amp; should be decoded to &");
            }
            other => panic!("expected String, got {other:?}"),
        }
    }

    #[test]
    fn tokenize_malformed_no_panic() {
        let _count = tokenize_fragment("<b oops").count();
    }

    // ---- BlockHeadingAccumulator -------------------------------------------------------

    #[test]
    fn block_heading_single_line_open_close() {
        let mut acc = BlockHeadingAccumulator::default();
        let start = acc.open(1);
        let end = acc.close();
        assert_eq!(
            start,
            Some(Event::StartHeading { id: None, level: 1 }),
            "open(1) should emit StartHeading level 1"
        );
        assert_eq!(
            end,
            Some(Event::EndHeading),
            "close() should emit EndHeading"
        );
    }

    #[test]
    fn block_heading_multi_line_open_then_close_separately() {
        let mut acc = BlockHeadingAccumulator::default();
        let start = acc.open(3);
        assert_eq!(
            start,
            Some(Event::StartHeading { id: None, level: 3 }),
            "open(3) should return StartHeading level 3"
        );
        assert!(acc.is_open(), "heading should be open after open()");
        let end = acc.close();
        assert_eq!(
            end,
            Some(Event::EndHeading),
            "close() should return EndHeading"
        );
        assert!(!acc.is_open(), "heading should be closed after close()");
    }

    #[test]
    fn block_heading_nested_heading_ignored() {
        let mut acc = BlockHeadingAccumulator::default();
        let first = acc.open(1);
        let second = acc.open(2);
        assert_eq!(
            first,
            Some(Event::StartHeading { id: None, level: 1 }),
            "first open should succeed"
        );
        assert_eq!(
            second, None,
            "nested heading open should be silently ignored"
        );
        let end = acc.close();
        assert_eq!(
            end,
            Some(Event::EndHeading),
            "close should end the first heading"
        );
    }

    #[test]
    fn block_heading_close_without_open() {
        let mut acc = BlockHeadingAccumulator::default();
        let result = acc.close();
        assert_eq!(
            result, None,
            "close() with no open heading should return None"
        );
    }

    #[test]
    fn block_heading_finish_block_auto_closes() {
        let mut acc = BlockHeadingAccumulator::default();
        acc.open(2);
        let result = acc.finish_block();
        assert_eq!(
            result,
            Some(Event::EndHeading),
            "finish_block() should auto-close an open heading"
        );
        assert!(
            !acc.is_open(),
            "heading should be closed after finish_block()"
        );
    }

    #[test]
    fn block_heading_finish_block_when_clean() {
        let mut acc = BlockHeadingAccumulator::default();
        let result = acc.finish_block();
        assert_eq!(
            result, None,
            "finish_block() with no open heading should return None"
        );
    }

    // ---- translate_void ----------------------------------------------------------------

    #[test]
    fn translate_void_br_in_inline_emits_linebreak() {
        let result = translate_void(&TagIntent::LineBreak, HtmlContext::Inline);
        assert_eq!(
            result,
            Some(Event::LineBreak),
            "LineBreak in Inline context should emit LineBreak"
        );
    }

    #[test]
    fn translate_void_br_in_block_emits_linebreak() {
        let result = translate_void(&TagIntent::LineBreak, HtmlContext::Block);
        assert_eq!(
            result,
            Some(Event::LineBreak),
            "LineBreak in Block context should emit LineBreak"
        );
    }

    #[test]
    fn translate_void_hr_in_block_emits_thematic_break() {
        let result = translate_void(&TagIntent::ThematicBreak, HtmlContext::Block);
        assert_eq!(
            result,
            Some(Event::ThematicBreak { id: None }),
            "ThematicBreak in Block context should emit ThematicBreak"
        );
    }

    #[test]
    fn translate_void_hr_in_inline_returns_none() {
        let result = translate_void(&TagIntent::ThematicBreak, HtmlContext::Inline);
        assert_eq!(
            result, None,
            "ThematicBreak in Inline context should return None"
        );
    }

    #[test]
    fn translate_void_non_void_intent_returns_none() {
        let result = translate_void(&TagIntent::Bold, HtmlContext::Block);
        assert_eq!(
            result, None,
            "Bold intent should return None (not a void element)"
        );
    }

    #[test]
    fn translate_void_heading_intent_returns_none() {
        let result = translate_void(&TagIntent::Heading(2), HtmlContext::Block);
        assert_eq!(
            result, None,
            "Heading intent should return None (not a void element)"
        );
    }

    // ---- translate_inline --------------------------------------------------------------

    #[test]
    fn translate_inline_bold_open_close_with_text() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<b>", &mut stack, false), Vec::new());
        assert_eq!(stack.note_text(), vec![start(TextStyleKind::Bold)]);
        assert_eq!(
            translate_inline("</b>", &mut stack, false),
            vec![Event::EndTextStyle]
        );
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_italic_open_close() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<em>", &mut stack, false), Vec::new());
        assert_eq!(stack.note_text(), vec![start(TextStyleKind::Italic)]);
        assert_eq!(
            translate_inline("</em>", &mut stack, false),
            vec![Event::EndTextStyle]
        );
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_bold_no_intervening_text() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<b>", &mut stack, false), Vec::new());
        assert_eq!(translate_inline("</b>", &mut stack, false), Vec::new());
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_hr_dropped_in_inline_context() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<hr>", &mut stack, false), Vec::new());
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_heading_dropped_in_inline_context() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<h1>", &mut stack, false), Vec::new());
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_br_emits_linebreak() {
        let mut stack = StyleStack::default();
        assert_eq!(
            translate_inline("<br>", &mut stack, false),
            vec![Event::LineBreak]
        );
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_br_self_closing() {
        let mut stack = StyleStack::default();
        assert_eq!(
            translate_inline("<br/>", &mut stack, false),
            vec![Event::LineBreak]
        );
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_uppercase_tag_normalized() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<B>", &mut stack, false), Vec::new());
        assert_eq!(stack.note_text(), vec![start(TextStyleKind::Bold)]);
        assert_eq!(
            translate_inline("</B>", &mut stack, false),
            vec![Event::EndTextStyle]
        );
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_unknown_tag_silently_dropped() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<div>", &mut stack, false), Vec::new());
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_malformed_no_panic() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<b oops", &mut stack, false), Vec::new());
    }

    #[test]
    fn translate_inline_mark_uses_constant_color() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<mark>", &mut stack, false), Vec::new());
        assert_eq!(
            stack.note_text(),
            vec![start(TextStyleKind::Mark(MARK_COLOR))]
        );
        assert_eq!(
            translate_inline("</mark>", &mut stack, false),
            vec![Event::EndTextStyle]
        );
        assert!(stack.is_empty());
    }

    // ---- translate_inline, preformatted mode -------------------------------------------

    #[test]
    fn translate_inline_preformatted_suppresses_styles() {
        let mut stack = StyleStack::default();
        assert_eq!(translate_inline("<b>", &mut stack, true), Vec::new());
        assert!(stack.is_empty());
    }

    #[test]
    fn translate_inline_styles_suppressed_in_preformatted() {
        let mut stack = StyleStack::default();
        let events = translate_inline("<b>text</b>", &mut stack, true);
        assert_eq!(
            events,
            vec![Event::Text {
                content: "text".to_owned()
            }],
            "open and close style tags suppressed; text preserved"
        );
    }

    #[test]
    fn translate_inline_text_passthrough_in_preformatted() {
        let mut stack = StyleStack::default();
        let events = translate_inline("<em>hello world</em>", &mut stack, true);
        assert_eq!(
            events,
            vec![Event::Text {
                content: "hello world".to_owned()
            }],
            "text inside a style tag is preserved even when in_preformatted=true"
        );
    }

    #[test]
    fn translate_inline_linebreak_emitted_in_preformatted() {
        let mut stack = StyleStack::default();
        let events = translate_inline("<br>", &mut stack, true);
        assert_eq!(
            events,
            vec![Event::LineBreak],
            "LineBreak is a void element unaffected by Rule 11; emitted even in preformatted"
        );
    }

    #[test]
    fn translate_inline_stack_state_unchanged_when_suppressed() {
        let mut stack = StyleStack::default();
        translate_inline("<b>", &mut stack, true);
        assert!(
            stack.is_empty(),
            "stack must not be modified when in_preformatted=true"
        );
    }

    #[test]
    fn translate_inline_non_preformatted_still_emits_styles() {
        let mut stack = StyleStack::default();
        let open_events = translate_inline("<b>", &mut stack, false);
        assert_eq!(
            open_events,
            Vec::new(),
            "open returns nothing (deferred until text)"
        );
        let start_events = stack.note_text();
        assert_eq!(
            start_events,
            vec![start(TextStyleKind::Bold)],
            "regression: normal mode (in_preformatted=false) still defers and emits StartTextStyle"
        );
        let close_events = translate_inline("</b>", &mut stack, false);
        assert_eq!(
            close_events,
            vec![Event::EndTextStyle],
            "regression: close emits EndTextStyle"
        );
        assert!(stack.is_empty());
    }

    // ---- translate_block ---------------------------------------------------------------

    /// Runs `translate_block` on `fragment` with fresh state and `in_preformatted = false`.
    fn translate_block_with_default_state(fragment: &str) -> Vec<Event> {
        let mut heading_acc = BlockHeadingAccumulator::default();
        let mut inline_stack = StyleStack::default();
        translate_block(fragment, &mut heading_acc, &mut inline_stack, false)
    }

    #[test]
    fn translate_block_single_line_h1() {
        assert_eq!(
            translate_block_with_default_state("<h1>Title</h1>"),
            vec![
                Event::StartHeading { id: None, level: 1 },
                Event::Text {
                    content: "Title".to_owned()
                },
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_single_line_h6() {
        assert_eq!(
            translate_block_with_default_state("<h6>Title</h6>"),
            vec![
                Event::StartHeading { id: None, level: 6 },
                Event::Text {
                    content: "Title".to_owned()
                },
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_h7_dropped() {
        assert_eq!(translate_block_with_default_state("<h7>X</h7>"), Vec::new());
    }

    #[test]
    fn translate_block_multi_line_open_then_close() {
        let mut heading_acc = BlockHeadingAccumulator::default();
        let mut inline_stack = StyleStack::default();
        assert_eq!(
            translate_block("<h1>\n", &mut heading_acc, &mut inline_stack, false),
            vec![Event::StartHeading { id: None, level: 1 }]
        );
        assert_eq!(
            translate_block(" Title\n", &mut heading_acc, &mut inline_stack, false),
            vec![Event::Text {
                content: " Title\n".to_owned()
            }]
        );
        assert_eq!(
            translate_block("</h1>\n", &mut heading_acc, &mut inline_stack, false),
            vec![Event::EndHeading]
        );
    }

    #[test]
    fn translate_block_nested_inline_inside_heading() {
        assert_eq!(
            translate_block_with_default_state("<h1><b>Bold Title</b></h1>"),
            vec![
                Event::StartHeading { id: None, level: 1 },
                start(TextStyleKind::Bold),
                Event::Text {
                    content: "Bold Title".to_owned()
                },
                Event::EndTextStyle,
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_inline_styles_auto_closed_on_heading_end() {
        assert_eq!(
            translate_block_with_default_state("<h1><b>oops</h1>"),
            vec![
                Event::StartHeading { id: None, level: 1 },
                start(TextStyleKind::Bold),
                Event::Text {
                    content: "oops".to_owned()
                },
                Event::EndTextStyle,
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_hr_in_block_context_emits_thematic_break() {
        assert_eq!(
            translate_block_with_default_state("<hr>"),
            vec![Event::ThematicBreak { id: None }]
        );
    }

    #[test]
    fn translate_block_br_in_heading_emits_linebreak() {
        assert_eq!(
            translate_block_with_default_state("<h1>line1<br/>line2</h1>"),
            vec![
                Event::StartHeading { id: None, level: 1 },
                Event::Text {
                    content: "line1".to_owned()
                },
                Event::LineBreak,
                Event::Text {
                    content: "line2".to_owned()
                },
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_text_outside_heading_dropped() {
        assert_eq!(translate_block_with_default_state("some text"), Vec::new());
    }

    #[test]
    fn translate_block_out_of_scope_tag_dropped() {
        assert_eq!(
            translate_block_with_default_state("<table>x</table>"),
            Vec::new()
        );
    }

    #[test]
    fn translate_block_malformed_no_panic() {
        let mut heading_acc = BlockHeadingAccumulator::default();
        let mut inline_stack = StyleStack::default();
        let _events = translate_block("<h1 oops", &mut heading_acc, &mut inline_stack, false);
    }

    #[test]
    fn translate_block_nested_heading_inner_ignored() {
        assert_eq!(
            translate_block_with_default_state("<h1>outer<h2>inner</h2></h1>"),
            vec![
                Event::StartHeading { id: None, level: 1 },
                Event::Text {
                    content: "outer".to_owned()
                },
                Event::EndHeading
            ]
        );
    }

    #[test]
    fn translate_block_suppresses_inline_in_pre() {
        let mut heading_acc = BlockHeadingAccumulator::default();
        let mut inline_stack = StyleStack::default();
        let events = translate_block(
            "<h1><b>text</b></h1>",
            &mut heading_acc,
            &mut inline_stack,
            true,
        );
        assert_eq!(
            events,
            vec![
                Event::StartHeading { id: None, level: 1 },
                Event::Text {
                    content: "text".to_owned()
                },
                Event::EndHeading,
            ],
            "Rule 11: Bold events suppressed inside preformatted; text and heading structure preserved"
        );
    }
}