extern crate alloc;
use alloc::collections::VecDeque;
pub use docspec_core::EventSource;
use docspec_core::{Event, Result, TextStyle};
use html5gum::{StringReader, Tokenizer};
/// Lifecycle state of an `HtmlReader`, consulted on every `next_event` call
/// once the pending-event queue is empty.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Phase {
/// End of input has been handled and `Event::EndDocument` was queued; once
/// the queue is drained, `next_event` returns `Ok(None)`.
Finished,
/// Initial state: `Event::StartDocument` has not been queued yet.
NotStarted,
/// The document has been opened and tokens are being pulled from the
/// tokenizer.
Running,
}
/// Pull-based reader that tokenizes an HTML string and turns it into a
/// stream of document events.
///
/// Only `<p>` elements and the text inside them produce events; the input is
/// borrowed for the lifetime `'a`.
pub struct HtmlReader<'a> {
/// `true` while a `StartParagraph` event has been queued without its
/// matching `EndParagraph`.
in_paragraph: bool,
/// Where the reader is in its lifecycle (not started / running / finished).
phase: Phase,
/// Events that have been produced but not yet handed to the caller; they are
/// returned oldest first.
queue: VecDeque<Event>,
/// The `html5gum` tokenizer reading from the borrowed input string.
tokens: Tokenizer<StringReader<'a>>,
}
impl<'a> HtmlReader<'a> {
fn drain_queue(&mut self) -> Option<Event> {
self.queue.pop_front()
}
fn handle_end_tag(&mut self, tag: &html5gum::EndTag<()>) {
if &*tag.name == b"p" && self.in_paragraph {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
}
fn handle_eof(&mut self) {
if self.in_paragraph {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
self.queue.push_back(Event::EndDocument);
self.phase = Phase::Finished;
}
fn handle_start_tag(&mut self, tag: &html5gum::StartTag<()>) {
if &*tag.name != b"p" || self.in_paragraph {
return;
}
self.queue.push_back(Event::StartParagraph {
alignment: None,
id: None,
});
self.in_paragraph = true;
if tag.self_closing {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
}
fn handle_text(&mut self, text_bytes: &[u8]) -> Result<()> {
if self.in_paragraph {
let text =
core::str::from_utf8(text_bytes).map_err(|e| docspec_core::Error::Parse {
message: format!("invalid UTF-8 in HTML text: {e}"),
position: None,
})?;
self.queue.push_back(Event::Text {
content: text.to_string(),
style: TextStyle::default(),
});
}
Ok(())
}
#[inline]
#[must_use]
pub fn new(input: &'a str) -> Self {
Self {
in_paragraph: false,
phase: Phase::NotStarted,
queue: VecDeque::new(),
tokens: Tokenizer::new(input),
}
}
}
impl EventSource for HtmlReader<'_> {
#[inline]
fn next_event(&mut self) -> Result<Option<Event>> {
loop {
if let Some(event) = self.drain_queue() {
return Ok(Some(event));
}
match self.phase {
Phase::NotStarted => {
self.phase = Phase::Running;
self.queue.push_back(Event::StartDocument {
id: None,
language: None,
metadata: None,
});
}
Phase::Finished => {
return Ok(None);
}
Phase::Running => {
let Some(result) = self.tokens.next() else {
self.handle_eof();
continue;
};
match result {
Ok(token) => match token {
html5gum::Token::StartTag(tag) => {
self.handle_start_tag(&tag);
}
html5gum::Token::EndTag(tag) => {
self.handle_end_tag(&tag);
}
html5gum::Token::String(spanned) => {
self.handle_text(&spanned.value.0)?;
}
html5gum::Token::Comment(_) | html5gum::Token::Doctype(_) => {
}
html5gum::Token::Error(spanned) => {
return Err(docspec_core::Error::Parse {
message: format!("html5gum: {:?}", spanned.value),
position: None,
});
}
},
Err(infallible) => match infallible {},
}
}
}
}
}
}