extern crate alloc;
use alloc::collections::VecDeque;
use std::io::{Cursor, Read, Seek};
pub use docspec_core::EventSource;
use docspec_core::{Event, Result};
use html5gum::{IoReader, Tokenizer};
/// Lifecycle of an [`HtmlReader`] event stream.
///
/// `Debug` is derived so the state can be logged and used in `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Phase {
    /// End of input was reached and `EndDocument` has been queued; once the
    /// queue is empty no further events are produced.
    Finished,
    /// No event has been produced yet; `StartDocument` still has to be queued.
    NotStarted,
    /// `StartDocument` has been queued and tokens are being pulled from the tokenizer.
    Running,
}
/// Pull-based reader that turns an HTML byte stream into document events.
///
/// Tokens are pulled from an `html5gum` tokenizer on demand; the events they
/// produce are buffered in an internal queue and handed out one at a time.
pub struct HtmlReader {
    /// `true` while a `StartParagraph` has been queued without its matching `EndParagraph`.
    in_paragraph: bool,
    /// Where the reader is in its start / running / finished lifecycle.
    phase: Phase,
    /// Events that have been produced but not yet returned to the caller.
    queue: VecDeque<Event>,
    /// Tokenizer over the boxed input reader.
    tokens: Tokenizer<IoReader<Box<dyn Read + Send>>>,
}
impl HtmlReader {
fn drain_queue(&mut self) -> Option<Event> {
self.queue.pop_front()
}
fn from_boxed_reader(reader: Box<dyn Read + Send>) -> Self {
Self {
in_paragraph: false,
phase: Phase::NotStarted,
queue: VecDeque::new(),
tokens: Tokenizer::new(IoReader::new(reader)),
}
}
#[inline]
pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
let boxed: Box<dyn Read + Send> = Box::new(reader);
Ok(Self::from_boxed_reader(boxed))
}
#[expect(
clippy::should_implement_trait,
reason = "Public API requires an infallible constructor named from_str."
)]
#[inline]
#[must_use]
pub fn from_str(input: &str) -> Self {
let bytes: Vec<u8> = input.as_bytes().to_vec();
let reader: Box<dyn Read + Send> = Box::new(Cursor::new(bytes));
Self::from_boxed_reader(reader)
}
fn handle_end_tag(&mut self, tag: &html5gum::EndTag<()>) {
if &*tag.name == b"p" && self.in_paragraph {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
}
fn handle_eof(&mut self) {
if self.in_paragraph {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
self.queue.push_back(Event::EndDocument);
self.phase = Phase::Finished;
}
fn handle_start_tag(&mut self, tag: &html5gum::StartTag<()>) {
if &*tag.name != b"p" || self.in_paragraph {
return;
}
self.queue.push_back(Event::StartParagraph {
alignment: None,
id: None,
});
self.in_paragraph = true;
if tag.self_closing {
self.queue.push_back(Event::EndParagraph);
self.in_paragraph = false;
}
}
fn handle_text(&mut self, text_bytes: &[u8]) -> Result<()> {
if self.in_paragraph {
let text =
core::str::from_utf8(text_bytes).map_err(|e| docspec_core::Error::Parse {
message: format!("invalid UTF-8 in HTML text: {e}"),
position: None,
})?;
self.queue.push_back(Event::Text {
content: text.to_string(),
});
}
Ok(())
}
}
impl EventSource for HtmlReader {
#[inline]
fn next_event(&mut self) -> Result<Option<Event>> {
loop {
if let Some(event) = self.drain_queue() {
return Ok(Some(event));
}
match self.phase {
Phase::NotStarted => {
self.phase = Phase::Running;
self.queue.push_back(Event::StartDocument {
id: None,
language: None,
metadata: None,
});
}
Phase::Finished => {
return Ok(None);
}
Phase::Running => {
let Some(result) = self.tokens.next() else {
self.handle_eof();
continue;
};
match result {
Ok(token) => match token {
html5gum::Token::StartTag(tag) => {
self.handle_start_tag(&tag);
}
html5gum::Token::EndTag(tag) => {
self.handle_end_tag(&tag);
}
html5gum::Token::String(spanned) => {
self.handle_text(&spanned.value.0)?;
}
html5gum::Token::Comment(_) | html5gum::Token::Doctype(_) => {
}
html5gum::Token::Error(spanned) => {
return Err(docspec_core::Error::Parse {
message: format!("html5gum: {:?}", spanned.value),
position: None,
});
}
},
Err(source) => return Err(docspec_core::Error::Io { source }),
}
}
}
}
}
}
#[cfg(test)]
mod send_static_assertions {
    /// Compiles only when `T` is both `Send` and `'static`.
    fn require_send_static<T>()
    where
        T: Send + 'static,
    {
    }

    /// Compile-time check that the reader is `Send + 'static`.
    #[test]
    fn html_reader_is_send_static() {
        require_send_static::<crate::HtmlReader>();
    }
}