Skip to main content

docspec_html_reader/
lib.rs

1//! HTML to `DocSpec` event stream reader.
2//!
3//! This crate provides an [`HtmlReader`] that implements [`EventSource`] to convert
4//! HTML documents into the `DocSpec` event stream format. It uses `html5gum`
5//! to parse HTML5-compliant markup and emits typed events representing document
6//! structure.
7//!
8//! # Quick Start
9//!
10//! ```
11//! use docspec_html_reader::{HtmlReader, EventSource};
12//!
13//! let html = "<p>Hello world</p>";
14//! let mut reader = HtmlReader::from_str(html);
15//!
16//! while let Some(event) = reader.next_event()? {
17//!     println!("{event:?}");
18//! }
19//! # Ok::<(), docspec_core::Error>(())
20//! ```
21//!
22//! # Supported Elements
23//!
24//! - Paragraphs → `StartParagraph` / `EndParagraph`
25//!
26//! # Unsupported Elements
27//!
//! All other HTML elements are silently ignored. Text content inside inline
//! elements (e.g., `<strong>`, `<em>`) that appear within a paragraph is
//! preserved as `Text` events, but the formatting structure is dropped. Text
//! outside of any `<p>` element is discarded.
31//!
32//! # Streaming
33//!
//! `HtmlReader` streams its source via `html5gum::IoReader`'s 16 KB sliding-window
//! buffer, so the tokenizer's read buffer stays the same size regardless of
//! document size. With [`HtmlReader::from_reader`] the document need not fit in
//! memory. [`HtmlReader::from_str`] uses the same streaming path internally, but
//! first copies its input into an owned buffer.
38
39extern crate alloc;
40
41use alloc::collections::VecDeque;
42use std::io::{Cursor, Read, Seek};
43
44pub use docspec_core::EventSource;
45use docspec_core::{Event, Result, TextStyle};
46use html5gum::{IoReader, Tokenizer};
47
/// Document processing phase.
///
/// The reader moves through the phases in one direction only:
/// `NotStarted` → `Running` → `Finished`. `next_event` inspects the phase
/// whenever its event queue is empty to decide what to do next.
///
/// `Debug` is derived alongside the comparison traits so that phase values can
/// be used with `assert_eq!` and included in diagnostic output.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Phase {
    /// `EndDocument` has been queued; once the queue is drained the reader
    /// reports end of stream.
    Finished,
    /// `StartDocument` not yet emitted.
    NotStarted,
    /// Processing events between `StartDocument` and `EndDocument`.
    Running,
}
58
/// A streaming HTML reader that implements [`EventSource`].
///
/// `HtmlReader` parses HTML using `html5gum` and emits `DocSpec` events
/// one at a time. Only `<p>` paragraph elements are recognized; all other
/// elements are silently ignored.
///
/// Text is emitted only while a paragraph is open; text outside of any `<p>`
/// is dropped. A `<p>` start tag seen while a paragraph is already open is
/// ignored, and a paragraph still open at end of input is closed
/// automatically before `EndDocument`.
///
/// # Example
///
/// ```
/// use docspec_html_reader::{HtmlReader, EventSource};
///
/// let mut reader = HtmlReader::from_str("<p>hello</p>");
/// while let Some(event) = reader.next_event()? {
///     // Process events...
/// }
/// # Ok::<(), docspec_core::Error>(())
/// ```
pub struct HtmlReader {
    /// Whether the reader is currently inside a `<p>` element.
    ///
    /// Gates both text emission and the handling of further `<p>` / `</p>` tags.
    in_paragraph: bool,
    /// Document processing phase.
    phase: Phase,
    /// Queue of `DocSpec` events to emit.
    ///
    /// A single token can translate into several events; they wait here until
    /// `next_event` hands them out one by one, oldest first.
    queue: VecDeque<Event>,
    /// The html5gum tokenizer iterator.
    ///
    /// The byte source is boxed so that one concrete reader type serves every
    /// constructor.
    tokens: Tokenizer<IoReader<Box<dyn Read + Send>>>,
}
86
87impl HtmlReader {
88    /// Pops the front event from the queue, if any.
89    fn drain_queue(&mut self) -> Option<Event> {
90        self.queue.pop_front()
91    }
92
93    fn from_boxed_reader(reader: Box<dyn Read + Send>) -> Self {
94        Self {
95            in_paragraph: false,
96            phase: Phase::NotStarted,
97            queue: VecDeque::new(),
98            tokens: Tokenizer::new(IoReader::new(reader)),
99        }
100    }
101
102    /// Creates an `HtmlReader` from any `Read + Seek` source.
103    ///
104    /// The source is streamed via `html5gum::IoReader`'s 16 KB sliding-window
105    /// buffer. Memory usage is constant regardless of document size.
106    ///
107    /// The `Seek` bound is required for API symmetry with other `DocSpec` readers
108    /// (e.g., `DocxReader`), even though `html5gum::IoReader` only needs `Read`.
109    ///
110    /// # Errors
111    ///
112    /// This constructor does not read from the source eagerly and therefore
113    /// currently cannot fail.
114    #[inline]
115    pub fn from_reader<R: Read + Seek + Send + 'static>(reader: R) -> Result<Self> {
116        let boxed: Box<dyn Read + Send> = Box::new(reader);
117        Ok(Self::from_boxed_reader(boxed))
118    }
119
120    /// Creates an `HtmlReader` from a string slice.
121    ///
122    /// The input bytes are copied into an owned `Vec<u8>` and streamed via
123    /// `html5gum::IoReader`'s 16 KB sliding-window buffer.
124    #[expect(
125        clippy::should_implement_trait,
126        reason = "Public API requires an infallible constructor named from_str."
127    )]
128    #[inline]
129    #[must_use]
130    pub fn from_str(input: &str) -> Self {
131        let bytes: Vec<u8> = input.as_bytes().to_vec();
132        let reader: Box<dyn Read + Send> = Box::new(Cursor::new(bytes));
133        Self::from_boxed_reader(reader)
134    }
135
136    /// Translates an end tag token into queued events.
137    fn handle_end_tag(&mut self, tag: &html5gum::EndTag<()>) {
138        if &*tag.name == b"p" && self.in_paragraph {
139            self.queue.push_back(Event::EndParagraph);
140            self.in_paragraph = false;
141        }
142        // Orphan </p> or non-p end tags: silently ignore
143    }
144
145    /// Handles end-of-input: auto-closes any open paragraph and emits `EndDocument`.
146    fn handle_eof(&mut self) {
147        if self.in_paragraph {
148            self.queue.push_back(Event::EndParagraph);
149            self.in_paragraph = false;
150        }
151        self.queue.push_back(Event::EndDocument);
152        self.phase = Phase::Finished;
153    }
154
155    /// Translates a start tag token into queued events.
156    fn handle_start_tag(&mut self, tag: &html5gum::StartTag<()>) {
157        if &*tag.name != b"p" || self.in_paragraph {
158            // Nested <p> while already in paragraph: silently ignore (including self-closing nested)
159            // All other tags: silently ignore
160            return;
161        }
162
163        self.queue.push_back(Event::StartParagraph {
164            alignment: None,
165            id: None,
166        });
167        self.in_paragraph = true;
168        if tag.self_closing {
169            self.queue.push_back(Event::EndParagraph);
170            self.in_paragraph = false;
171        }
172    }
173
174    /// Translates a text token into a queued event.
175    ///
176    /// # Errors
177    ///
178    /// Returns `Err` if the text bytes are not valid UTF-8.
179    fn handle_text(&mut self, text_bytes: &[u8]) -> Result<()> {
180        if self.in_paragraph {
181            let text =
182                core::str::from_utf8(text_bytes).map_err(|e| docspec_core::Error::Parse {
183                    message: format!("invalid UTF-8 in HTML text: {e}"),
184                    position: None,
185                })?;
186            self.queue.push_back(Event::Text {
187                content: text.to_string(),
188                style: TextStyle::default(),
189            });
190        }
191        Ok(())
192    }
193}
194
195impl EventSource for HtmlReader {
196    #[inline]
197    fn next_event(&mut self) -> Result<Option<Event>> {
198        loop {
199            if let Some(event) = self.drain_queue() {
200                return Ok(Some(event));
201            }
202
203            match self.phase {
204                Phase::NotStarted => {
205                    self.phase = Phase::Running;
206                    self.queue.push_back(Event::StartDocument {
207                        id: None,
208                        language: None,
209                        metadata: None,
210                    });
211                }
212                Phase::Finished => {
213                    return Ok(None);
214                }
215                Phase::Running => {
216                    let Some(result) = self.tokens.next() else {
217                        self.handle_eof();
218                        continue;
219                    };
220                    match result {
221                        Ok(token) => match token {
222                            html5gum::Token::StartTag(tag) => {
223                                self.handle_start_tag(&tag);
224                            }
225                            html5gum::Token::EndTag(tag) => {
226                                self.handle_end_tag(&tag);
227                            }
228                            html5gum::Token::String(spanned) => {
229                                self.handle_text(&spanned.value.0)?;
230                            }
231                            html5gum::Token::Comment(_) | html5gum::Token::Doctype(_) => {
232                                // Silently ignore
233                            }
234                            html5gum::Token::Error(spanned) => {
235                                return Err(docspec_core::Error::Parse {
236                                    message: format!("html5gum: {:?}", spanned.value),
237                                    position: None,
238                                });
239                            }
240                        },
241                        Err(source) => return Err(docspec_core::Error::Io { source }),
242                    }
243                }
244            }
245        }
246    }
247}
248
#[cfg(test)]
mod send_static_assertions {
    use crate::HtmlReader;

    /// Compiles only when `T` is `Send + 'static`; does nothing at run time.
    fn assert_send_static<T>()
    where
        T: Send + 'static,
    {
    }

    /// Guards against a change that makes `HtmlReader` non-`Send` or gives it a
    /// borrowed lifetime: such a change turns this test into a compile error.
    #[test]
    fn html_reader_is_send_static() {
        assert_send_static::<HtmlReader>();
    }
}