Skip to main content

docspec_html_reader/
lib.rs

1//! HTML to `DocSpec` event stream reader.
2//!
3//! This crate provides an [`HtmlReader`] that implements [`EventSource`] to convert
4//! HTML documents into the `DocSpec` event stream format. It uses `html5gum`
5//! to parse HTML5-compliant markup and emits typed events representing document
6//! structure.
7//!
8//! # Quick Start
9//!
10//! ```
11//! use docspec_html_reader::{HtmlReader, EventSource};
12//!
13//! let html = "<p>Hello world</p>";
14//! let mut reader = HtmlReader::new(html);
15//!
16//! while let Some(event) = reader.next_event()? {
17//!     println!("{event:?}");
18//! }
19//! # Ok::<(), docspec_core::Error>(())
20//! ```
21//!
22//! # Supported Elements
23//!
24//! - Paragraphs → `StartParagraph` / `EndParagraph`
25//!
26//! # Unsupported Elements
27//!
//! All other HTML elements are silently ignored. Text content inside inline
//! elements (e.g., `<strong>`, `<em>`) within a paragraph is preserved as
//! `Text` events, but the formatting structure is dropped. Text that appears
//! outside any `<p>` element is discarded.
31
32extern crate alloc;
33
34use alloc::collections::VecDeque;
35
36pub use docspec_core::EventSource;
37use docspec_core::{Event, Result, TextStyle};
38use html5gum::{StringReader, Tokenizer};
39
/// Lifecycle state of the reader's event stream.
#[derive(Clone, Copy, Eq, PartialEq)]
enum Phase {
    /// The closing `EndDocument` has been produced; no further events follow.
    Finished,
    /// Nothing has been produced yet; `StartDocument` is still pending.
    NotStarted,
    /// `StartDocument` has been produced and `EndDocument` has not.
    Running,
}
50
51/// A streaming HTML reader that implements [`EventSource`].
52///
53/// `HtmlReader` parses HTML using `html5gum` and emits `DocSpec` events
54/// one at a time. Only `<p>` paragraph elements are recognized; all other
55/// elements are silently ignored.
56///
57/// # Example
58///
59/// ```
60/// use docspec_html_reader::{HtmlReader, EventSource};
61///
62/// let mut reader = HtmlReader::new("<p>hello</p>");
63/// while let Some(event) = reader.next_event()? {
64///     // Process events...
65/// }
66/// # Ok::<(), docspec_core::Error>(())
67/// ```
68pub struct HtmlReader<'a> {
69    /// Whether the reader is currently inside a `<p>` element.
70    in_paragraph: bool,
71    /// Document processing phase.
72    phase: Phase,
73    /// Queue of `DocSpec` events to emit.
74    queue: VecDeque<Event>,
75    /// The html5gum tokenizer iterator.
76    tokens: Tokenizer<StringReader<'a>>,
77}
78
79impl<'a> HtmlReader<'a> {
80    /// Pops the front event from the queue, if any.
81    fn drain_queue(&mut self) -> Option<Event> {
82        self.queue.pop_front()
83    }
84
85    /// Translates an end tag token into queued events.
86    fn handle_end_tag(&mut self, tag: &html5gum::EndTag<()>) {
87        if &*tag.name == b"p" && self.in_paragraph {
88            self.queue.push_back(Event::EndParagraph);
89            self.in_paragraph = false;
90        }
91        // Orphan </p> or non-p end tags: silently ignore
92    }
93
94    /// Handles end-of-input: auto-closes any open paragraph and emits `EndDocument`.
95    fn handle_eof(&mut self) {
96        if self.in_paragraph {
97            self.queue.push_back(Event::EndParagraph);
98            self.in_paragraph = false;
99        }
100        self.queue.push_back(Event::EndDocument);
101        self.phase = Phase::Finished;
102    }
103
104    /// Translates a start tag token into queued events.
105    fn handle_start_tag(&mut self, tag: &html5gum::StartTag<()>) {
106        if &*tag.name != b"p" || self.in_paragraph {
107            // Nested <p> while already in paragraph: silently ignore (including self-closing nested)
108            // All other tags: silently ignore
109            return;
110        }
111
112        self.queue.push_back(Event::StartParagraph {
113            alignment: None,
114            id: None,
115        });
116        self.in_paragraph = true;
117        if tag.self_closing {
118            self.queue.push_back(Event::EndParagraph);
119            self.in_paragraph = false;
120        }
121    }
122
123    /// Translates a text token into a queued event.
124    ///
125    /// # Errors
126    ///
127    /// Returns `Err` if the text bytes are not valid UTF-8.
128    fn handle_text(&mut self, text_bytes: &[u8]) -> Result<()> {
129        if self.in_paragraph {
130            let text =
131                core::str::from_utf8(text_bytes).map_err(|e| docspec_core::Error::Parse {
132                    message: format!("invalid UTF-8 in HTML text: {e}"),
133                    position: None,
134                })?;
135            self.queue.push_back(Event::Text {
136                content: text.to_string(),
137                style: TextStyle::default(),
138            });
139        }
140        Ok(())
141    }
142
143    /// Creates a new `HtmlReader` from the given HTML string.
144    ///
145    /// The reader will emit `StartDocument` as its first event and `EndDocument`
146    /// as its last event, with the parsed content events in between.
147    ///
148    /// # Example
149    ///
150    /// ```
151    /// use docspec_html_reader::HtmlReader;
152    ///
153    /// let reader = HtmlReader::new("<p>Hello World</p>");
154    /// ```
155    #[inline]
156    #[must_use]
157    pub fn new(input: &'a str) -> Self {
158        Self {
159            in_paragraph: false,
160            phase: Phase::NotStarted,
161            queue: VecDeque::new(),
162            tokens: Tokenizer::new(input),
163        }
164    }
165}
166
167impl EventSource for HtmlReader<'_> {
168    #[inline]
169    fn next_event(&mut self) -> Result<Option<Event>> {
170        loop {
171            if let Some(event) = self.drain_queue() {
172                return Ok(Some(event));
173            }
174
175            match self.phase {
176                Phase::NotStarted => {
177                    self.phase = Phase::Running;
178                    self.queue.push_back(Event::StartDocument {
179                        id: None,
180                        language: None,
181                        metadata: None,
182                    });
183                }
184                Phase::Finished => {
185                    return Ok(None);
186                }
187                Phase::Running => {
188                    let Some(result) = self.tokens.next() else {
189                        self.handle_eof();
190                        continue;
191                    };
192                    match result {
193                        Ok(token) => match token {
194                            html5gum::Token::StartTag(tag) => {
195                                self.handle_start_tag(&tag);
196                            }
197                            html5gum::Token::EndTag(tag) => {
198                                self.handle_end_tag(&tag);
199                            }
200                            html5gum::Token::String(spanned) => {
201                                self.handle_text(&spanned.value.0)?;
202                            }
203                            html5gum::Token::Comment(_) | html5gum::Token::Doctype(_) => {
204                                // Silently ignore
205                            }
206                            html5gum::Token::Error(spanned) => {
207                                return Err(docspec_core::Error::Parse {
208                                    message: format!("html5gum: {:?}", spanned.value),
209                                    position: None,
210                                });
211                            }
212                        },
213                        Err(infallible) => match infallible {},
214                    }
215                }
216            }
217        }
218    }
219}