docspec_html_reader/lib.rs
1//! HTML to `DocSpec` event stream reader.
2//!
3//! This crate provides an [`HtmlReader`] that implements [`EventSource`] to convert
4//! HTML documents into the `DocSpec` event stream format. It uses `html5gum`
5//! to parse HTML5-compliant markup and emits typed events representing document
6//! structure.
7//!
8//! # Quick Start
9//!
10//! ```
11//! use docspec_html_reader::{HtmlReader, EventSource};
12//!
13//! let html = "<p>Hello world</p>";
14//! let mut reader = HtmlReader::new(html);
15//!
16//! while let Some(event) = reader.next_event()? {
17//! println!("{event:?}");
18//! }
19//! # Ok::<(), docspec_core::Error>(())
20//! ```
21//!
22//! # Supported Elements
23//!
24//! - Paragraphs → `StartParagraph` / `EndParagraph`
25//!
26//! # Unsupported Elements
27//!
//! All other HTML elements are silently ignored. Text content inside inline
//! elements (e.g., `<strong>`, `<em>`) that sit within a paragraph is preserved
//! as `Text` events, but the formatting structure is dropped. Text that appears
//! outside of any `<p>` element is discarded.
31
32extern crate alloc;
33
34use alloc::collections::VecDeque;
35
36pub use docspec_core::EventSource;
37use docspec_core::{Event, Result, TextStyle};
38use html5gum::{StringReader, Tokenizer};
39
/// Lifecycle state of the reader's event stream.
///
/// The reader moves through `NotStarted`, then `Running`, then `Finished`,
/// and never goes back.
#[derive(Clone, Copy, Eq, PartialEq)]
enum Phase {
    /// Terminal state: `EndDocument` has been produced; once the queue is
    /// drained there is nothing further to read.
    Finished,
    /// Initial state: `StartDocument` has not been produced yet.
    NotStarted,
    /// Tokens are being translated into the events that sit between
    /// `StartDocument` and `EndDocument`.
    Running,
}
50
51/// A streaming HTML reader that implements [`EventSource`].
52///
53/// `HtmlReader` parses HTML using `html5gum` and emits `DocSpec` events
54/// one at a time. Only `<p>` paragraph elements are recognized; all other
55/// elements are silently ignored.
56///
57/// # Example
58///
59/// ```
60/// use docspec_html_reader::{HtmlReader, EventSource};
61///
62/// let mut reader = HtmlReader::new("<p>hello</p>");
63/// while let Some(event) = reader.next_event()? {
64/// // Process events...
65/// }
66/// # Ok::<(), docspec_core::Error>(())
67/// ```
68pub struct HtmlReader<'a> {
69 /// Whether the reader is currently inside a `<p>` element.
70 in_paragraph: bool,
71 /// Document processing phase.
72 phase: Phase,
73 /// Queue of `DocSpec` events to emit.
74 queue: VecDeque<Event>,
75 /// The html5gum tokenizer iterator.
76 tokens: Tokenizer<StringReader<'a>>,
77}
78
79impl<'a> HtmlReader<'a> {
80 /// Pops the front event from the queue, if any.
81 fn drain_queue(&mut self) -> Option<Event> {
82 self.queue.pop_front()
83 }
84
85 /// Translates an end tag token into queued events.
86 fn handle_end_tag(&mut self, tag: &html5gum::EndTag<()>) {
87 if &*tag.name == b"p" && self.in_paragraph {
88 self.queue.push_back(Event::EndParagraph);
89 self.in_paragraph = false;
90 }
91 // Orphan </p> or non-p end tags: silently ignore
92 }
93
94 /// Handles end-of-input: auto-closes any open paragraph and emits `EndDocument`.
95 fn handle_eof(&mut self) {
96 if self.in_paragraph {
97 self.queue.push_back(Event::EndParagraph);
98 self.in_paragraph = false;
99 }
100 self.queue.push_back(Event::EndDocument);
101 self.phase = Phase::Finished;
102 }
103
104 /// Translates a start tag token into queued events.
105 fn handle_start_tag(&mut self, tag: &html5gum::StartTag<()>) {
106 if &*tag.name != b"p" || self.in_paragraph {
107 // Nested <p> while already in paragraph: silently ignore (including self-closing nested)
108 // All other tags: silently ignore
109 return;
110 }
111
112 self.queue.push_back(Event::StartParagraph {
113 alignment: None,
114 id: None,
115 });
116 self.in_paragraph = true;
117 if tag.self_closing {
118 self.queue.push_back(Event::EndParagraph);
119 self.in_paragraph = false;
120 }
121 }
122
123 /// Translates a text token into a queued event.
124 ///
125 /// # Errors
126 ///
127 /// Returns `Err` if the text bytes are not valid UTF-8.
128 fn handle_text(&mut self, text_bytes: &[u8]) -> Result<()> {
129 if self.in_paragraph {
130 let text =
131 core::str::from_utf8(text_bytes).map_err(|e| docspec_core::Error::Parse {
132 message: format!("invalid UTF-8 in HTML text: {e}"),
133 position: None,
134 })?;
135 self.queue.push_back(Event::Text {
136 content: text.to_string(),
137 style: TextStyle::default(),
138 });
139 }
140 Ok(())
141 }
142
143 /// Creates a new `HtmlReader` from the given HTML string.
144 ///
145 /// The reader will emit `StartDocument` as its first event and `EndDocument`
146 /// as its last event, with the parsed content events in between.
147 ///
148 /// # Example
149 ///
150 /// ```
151 /// use docspec_html_reader::HtmlReader;
152 ///
153 /// let reader = HtmlReader::new("<p>Hello World</p>");
154 /// ```
155 #[inline]
156 #[must_use]
157 pub fn new(input: &'a str) -> Self {
158 Self {
159 in_paragraph: false,
160 phase: Phase::NotStarted,
161 queue: VecDeque::new(),
162 tokens: Tokenizer::new(input),
163 }
164 }
165}
166
167impl EventSource for HtmlReader<'_> {
168 #[inline]
169 fn next_event(&mut self) -> Result<Option<Event>> {
170 loop {
171 if let Some(event) = self.drain_queue() {
172 return Ok(Some(event));
173 }
174
175 match self.phase {
176 Phase::NotStarted => {
177 self.phase = Phase::Running;
178 self.queue.push_back(Event::StartDocument {
179 id: None,
180 language: None,
181 metadata: None,
182 });
183 }
184 Phase::Finished => {
185 return Ok(None);
186 }
187 Phase::Running => {
188 let Some(result) = self.tokens.next() else {
189 self.handle_eof();
190 continue;
191 };
192 match result {
193 Ok(token) => match token {
194 html5gum::Token::StartTag(tag) => {
195 self.handle_start_tag(&tag);
196 }
197 html5gum::Token::EndTag(tag) => {
198 self.handle_end_tag(&tag);
199 }
200 html5gum::Token::String(spanned) => {
201 self.handle_text(&spanned.value.0)?;
202 }
203 html5gum::Token::Comment(_) | html5gum::Token::Doctype(_) => {
204 // Silently ignore
205 }
206 html5gum::Token::Error(spanned) => {
207 return Err(docspec_core::Error::Parse {
208 message: format!("html5gum: {:?}", spanned.value),
209 position: None,
210 });
211 }
212 },
213 Err(infallible) => match infallible {},
214 }
215 }
216 }
217 }
218 }
219}