html5gum/emitters/
callback.rs

1//! Consume the parsed HTML as a series of events through a callback.
2//!
3//! While using the [crate::DefaultEmitter] provides an easy-to-use API with low performance, and
4//! implementing your own [crate::Emitter] brings maximal performance and maximal pain, this is a middle
5//! ground. All strings are borrowed from some intermediate buffer instead of individually
6//! allocated.
7//!
8//! ```
9//! // Extract all text between span tags, in a naive (but fast) way. Does not handle tags inside of the span. See `examples/` as well.
10//! use html5gum::Tokenizer;
11//! use html5gum::emitters::callback::{CallbackEvent, CallbackEmitter};
12//!
13//! let mut is_in_span = false;
14//! let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option<Vec<u8>> {
15//!     match event {
16//!         CallbackEvent::OpenStartTag { name } => {
17//!             is_in_span = name == b"span";
18//!         },
19//!         CallbackEvent::String { value } if is_in_span => {
20//!             return Some(value.to_vec());
21//!         }
22//!         CallbackEvent::EndTag { .. } => {
23//!             is_in_span = false;
24//!         }
25//!         _ => {}
26//!     }
27//!
28//!     None
29//! });
30//!
31//! let input = r#"<h1><span class=hello>Hello</span> world!</h1>"#;
32//! let Ok(text_fragments) = Tokenizer::new_with_emitter(input, emitter)
33//!     .collect::<Result<Vec<_>, _>>();
34//!
35//! assert_eq!(text_fragments, vec![b"Hello".to_vec()]);
36//! ```
37
38use std::collections::VecDeque;
39use std::convert::Infallible;
40use std::mem::swap;
41
42use crate::utils::trace_log;
43use crate::{naive_next_state, Emitter, Error, State};
44
45/// Events used by [CallbackEmitter].
46///
47/// This operates at a slightly lower level than [crate::Token], as start tags are split up into multiple
48/// events.
49#[derive(Debug)]
50pub enum CallbackEvent<'a> {
51    /// Visit the `"<mytag"` in `"<mytag mykey=myvalue>"`. Signifies the beginning of a new start
52    /// tag.
53    ///
54    /// Attributes have not yet been read.
55    OpenStartTag {
56        /// The name of the start tag.
57        name: &'a [u8],
58    },
59
60    /// Visit an attribute name, for example `"mykey"` in `"<mytag mykey=myvalue>"`.
61    ///
62    /// The attribute value has not yet been read.
63    AttributeName {
64        /// The name of the attribute.
65        name: &'a [u8],
66    },
67
68    /// Visit an attribute value, for example `"myvalue"` in `"<mytag mykey=myvalue>"`.
69    ///
70    /// Things like whitespace, quote handling is taken care of.
71    ///
72    /// After this event, the start tag may be closed using `CloseStartTag`, or another
73    /// `AttributeName` may follow.
74    AttributeValue {
75        /// The value of the attribute.
76        value: &'a [u8],
77    },
78
79    /// Visit the end of the start tag, for example `">"` in `"<mytag mykey=myvalue>"`.
80    ///
81    CloseStartTag {
82        /// Whether the tag ended with `"/>"`.
83        ///
84        /// Note that in HTML5 this difference is largely ignored, and tags are considered
85        /// self-closing based on a hardcoded list of names, not based on syntax.
86        self_closing: bool,
87    },
88
89    /// Visit `"</mytag>"`.
90    ///
91    /// Note: Because of strangeness in the HTML spec, attributes may be observed outside of start
92    /// tags, before this event. It's best to ignore them as they are not valid HTML, but can still
93    /// be observed through most HTML parsers.
94    EndTag {
95        /// The name of the end tag.
96        name: &'a [u8],
97    },
98
99    /// Visit a string, as in, the actual text between tags. The content. Remember actual content
100    /// in HTML, before SPAs took over? I remember.
101    ///
102    /// It's guaranteed that all consecutive "character tokens" (as the spec calls them) are folded
103    /// into one string event.
104    String {
105        /// A series of character tokens.
106        value: &'a [u8],
107    },
108
109    /// Visit a comment, like `<!-- DON'T HACK THIS WEBSITE -->`
110    Comment {
111        /// The contents of the comment.
112        value: &'a [u8],
113    },
114
115    /// Visit `<!DOCTYPE html>`.
116    Doctype {
117        /// Name of the docstring.
118        name: &'a [u8],
119        /// Public identifier (see spec)
120        public_identifier: Option<&'a [u8]>,
121        /// System identifier (see spec)
122        system_identifier: Option<&'a [u8]>,
123        /// Enable quirksmode
124        force_quirks: bool,
125    },
126
127    /// Visit a parsing error.
128    Error(Error),
129}
130
/// Which kind of tag is currently being assembled by the emitter.
#[derive(Debug, Clone, Copy)]
enum CurrentTag {
    /// A start tag (`<name ...>`).
    Start,
    /// An end tag (`</name>`).
    End,
}
136
/// The user callback together with the queue of tokens it has produced so far.
#[derive(Debug)]
struct CallbackState<F, T> {
    /// The callback that receives every event.
    callback: F,
    /// Values returned by the callback that have not been popped yet.
    emitted_tokens: VecDeque<T>,
}
142
143/// This trait is implemented for all functions that have the same signature as
144/// [Callback::handle_event]. The trait only exists in case you want to implement it on a nameable
145/// type.
146pub trait Callback<T> {
147    /// Perform some action on a parsing event, and, optionally, return a value that can be yielded
148    /// from the [crate::Tokenizer] iterator.
149    fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option<T>;
150}
151
152impl<T, F> Callback<T> for F
153where
154    F: FnMut(CallbackEvent<'_>) -> Option<T>,
155{
156    fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option<T> {
157        self(event)
158    }
159}
160
161impl<F, T> CallbackState<F, T>
162where
163    F: Callback<T>,
164{
165    fn emit_event(&mut self, event: CallbackEvent<'_>) {
166        let res = self.callback.handle_event(event);
167        if let Some(token) = res {
168            self.emitted_tokens.push_front(token);
169        }
170    }
171}
172
173impl<F, T> Default for CallbackState<F, T>
174where
175    F: Default,
176{
177    fn default() -> Self {
178        CallbackState {
179            callback: F::default(),
180            emitted_tokens: VecDeque::default(),
181        }
182    }
183}
184
185#[derive(Debug, Default)]
186struct EmitterState {
187    naively_switch_states: bool,
188
189    current_characters: Vec<u8>,
190    current_comment: Vec<u8>,
191
192    last_start_tag: Vec<u8>,
193    current_tag_had_attributes: bool,
194    current_tag_type: Option<CurrentTag>,
195    current_tag_self_closing: bool,
196    current_tag_name: Vec<u8>,
197    current_attribute_name: Vec<u8>,
198    current_attribute_value: Vec<u8>,
199
200    // strings related to doctype
201    doctype_name: Vec<u8>,
202    doctype_has_public_identifier: bool,
203    doctype_has_system_identifier: bool,
204    doctype_public_identifier: Vec<u8>,
205    doctype_system_identifier: Vec<u8>,
206    doctype_force_quirks: bool,
207}
208
209/// The emitter class to pass to [crate::Tokenizer::new_with_emitter]. Please refer to the
210/// module-level documentation on [crate::emitters::callback] for usage.
211#[derive(Debug)]
212pub struct CallbackEmitter<F, T = Infallible> {
213    // this struct is only split out so [CallbackState::emit_event] can borrow things concurrently
214    // with other attributes.
215    callback_state: CallbackState<F, T>,
216    emitter_state: EmitterState,
217}
218
219impl<F, T> Default for CallbackEmitter<F, T>
220where
221    F: Default,
222{
223    fn default() -> Self {
224        CallbackEmitter {
225            callback_state: CallbackState::default(),
226            emitter_state: EmitterState::default(),
227        }
228    }
229}
230
231impl<F, T> CallbackEmitter<F, T>
232where
233    F: Callback<T>,
234{
235    /// Create a new emitter.
236    ///
237    /// The given callback may return optional tokens that then become available through the
238    /// [crate::Tokenizer]'s iterator. If that's not used, return `Option<Infallible>`.
239    pub fn new(callback: F) -> Self {
240        CallbackEmitter {
241            callback_state: CallbackState {
242                callback,
243                emitted_tokens: VecDeque::new(),
244            },
245            emitter_state: EmitterState::default(),
246        }
247    }
248
249    /// Get mutable access to the inner callback.
250    pub fn callback_mut(&mut self) -> &mut F {
251        &mut self.callback_state.callback
252    }
253
254    /// Whether to use [`naive_next_state`] to switch states automatically.
255    ///
256    /// The default is off.
257    pub fn naively_switch_states(&mut self, yes: bool) {
258        self.emitter_state.naively_switch_states = yes;
259    }
260
261    fn flush_attribute_name(&mut self) {
262        if !self.emitter_state.current_attribute_name.is_empty() {
263            self.callback_state
264                .emit_event(CallbackEvent::AttributeName {
265                    name: &self.emitter_state.current_attribute_name,
266                });
267            self.emitter_state.current_attribute_name.clear();
268        }
269    }
270
271    fn flush_attribute(&mut self) {
272        self.flush_attribute_name();
273
274        if !self.emitter_state.current_attribute_value.is_empty() {
275            self.callback_state
276                .emit_event(CallbackEvent::AttributeValue {
277                    value: &self.emitter_state.current_attribute_value,
278                });
279            self.emitter_state.current_attribute_value.clear();
280        }
281    }
282
283    fn flush_open_start_tag(&mut self) {
284        if matches!(self.emitter_state.current_tag_type, Some(CurrentTag::Start))
285            && !self.emitter_state.current_tag_name.is_empty()
286        {
287            self.callback_state.emit_event(CallbackEvent::OpenStartTag {
288                name: &self.emitter_state.current_tag_name,
289            });
290
291            self.emitter_state.last_start_tag.clear();
292            swap(
293                &mut self.emitter_state.last_start_tag,
294                &mut self.emitter_state.current_tag_name,
295            );
296        }
297    }
298
299    fn flush_current_characters(&mut self) {
300        if self.emitter_state.current_characters.is_empty() {
301            return;
302        }
303
304        self.callback_state.emit_event(CallbackEvent::String {
305            value: &self.emitter_state.current_characters,
306        });
307        self.emitter_state.current_characters.clear();
308    }
309}
310impl<F, T> Emitter for CallbackEmitter<F, T>
311where
312    F: Callback<T>,
313{
314    type Token = T;
315
316    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
317        self.emitter_state.last_start_tag.clear();
318        self.emitter_state
319            .last_start_tag
320            .extend(last_start_tag.unwrap_or_default());
321    }
322
323    fn emit_eof(&mut self) {
324        self.flush_current_characters();
325    }
326
327    fn emit_error(&mut self, error: Error) {
328        self.callback_state.emit_event(CallbackEvent::Error(error));
329    }
330
331    fn pop_token(&mut self) -> Option<Self::Token> {
332        self.callback_state.emitted_tokens.pop_back()
333    }
334
335    fn emit_string(&mut self, s: &[u8]) {
336        crate::utils::trace_log!("callbacks: emit_string, len={}", s.len());
337        self.emitter_state.current_characters.extend(s);
338    }
339
340    fn init_start_tag(&mut self) {
341        self.emitter_state.current_tag_name.clear();
342        self.emitter_state.current_tag_type = Some(CurrentTag::Start);
343        self.emitter_state.current_tag_self_closing = false;
344    }
345
346    fn init_end_tag(&mut self) {
347        self.emitter_state.current_tag_name.clear();
348        self.emitter_state.current_tag_type = Some(CurrentTag::End);
349        self.emitter_state.current_tag_had_attributes = false;
350    }
351
352    fn init_comment(&mut self) {
353        self.flush_current_characters();
354        self.emitter_state.current_comment.clear();
355    }
356
357    fn emit_current_tag(&mut self) -> Option<State> {
358        self.flush_attribute();
359        self.flush_current_characters();
360        match self.emitter_state.current_tag_type {
361            Some(CurrentTag::Start) => {
362                self.flush_open_start_tag();
363                self.callback_state
364                    .emit_event(CallbackEvent::CloseStartTag {
365                        self_closing: self.emitter_state.current_tag_self_closing,
366                    });
367            }
368            Some(CurrentTag::End) => {
369                if self.emitter_state.current_tag_had_attributes {
370                    self.emit_error(Error::EndTagWithAttributes);
371                }
372                self.emitter_state.last_start_tag.clear();
373                self.callback_state.emit_event(CallbackEvent::EndTag {
374                    name: &self.emitter_state.current_tag_name,
375                });
376            }
377            _ => {}
378        }
379
380        if self.emitter_state.naively_switch_states {
381            naive_next_state(&self.emitter_state.last_start_tag)
382        } else {
383            None
384        }
385    }
386    fn emit_current_comment(&mut self) {
387        self.callback_state.emit_event(CallbackEvent::Comment {
388            value: &self.emitter_state.current_comment,
389        });
390        self.emitter_state.current_comment.clear();
391    }
392
393    fn emit_current_doctype(&mut self) {
394        self.callback_state.emit_event(CallbackEvent::Doctype {
395            name: &self.emitter_state.doctype_name,
396            public_identifier: if self.emitter_state.doctype_has_public_identifier {
397                Some(&self.emitter_state.doctype_public_identifier)
398            } else {
399                None
400            },
401            system_identifier: if self.emitter_state.doctype_has_system_identifier {
402                Some(&self.emitter_state.doctype_system_identifier)
403            } else {
404                None
405            },
406            force_quirks: self.emitter_state.doctype_force_quirks,
407        });
408    }
409
410    fn set_self_closing(&mut self) {
411        trace_log!("set_self_closing");
412        if matches!(self.emitter_state.current_tag_type, Some(CurrentTag::End)) {
413            self.callback_state
414                .emit_event(CallbackEvent::Error(Error::EndTagWithTrailingSolidus));
415        } else {
416            self.emitter_state.current_tag_self_closing = true;
417        }
418    }
419
420    fn set_force_quirks(&mut self) {
421        self.emitter_state.doctype_force_quirks = true;
422    }
423
424    fn push_tag_name(&mut self, s: &[u8]) {
425        self.emitter_state.current_tag_name.extend(s);
426    }
427
428    fn push_comment(&mut self, s: &[u8]) {
429        self.emitter_state.current_comment.extend(s);
430    }
431
432    fn push_doctype_name(&mut self, s: &[u8]) {
433        self.emitter_state.doctype_name.extend(s);
434    }
435
436    fn init_doctype(&mut self) {
437        self.flush_current_characters();
438        self.emitter_state.doctype_name.clear();
439        self.emitter_state.doctype_has_public_identifier = false;
440        self.emitter_state.doctype_has_system_identifier = false;
441        self.emitter_state.doctype_public_identifier.clear();
442        self.emitter_state.doctype_system_identifier.clear();
443        self.emitter_state.doctype_force_quirks = false;
444    }
445
446    fn init_attribute(&mut self) {
447        self.flush_open_start_tag();
448        self.flush_attribute();
449        self.emitter_state.current_tag_had_attributes = true;
450    }
451
452    fn push_attribute_name(&mut self, s: &[u8]) {
453        self.emitter_state.current_attribute_name.extend(s);
454    }
455
456    fn push_attribute_value(&mut self, s: &[u8]) {
457        self.flush_attribute_name();
458        self.emitter_state.current_attribute_value.extend(s);
459    }
460
461    fn set_doctype_public_identifier(&mut self, value: &[u8]) {
462        self.emitter_state.doctype_has_public_identifier = true;
463        self.emitter_state.doctype_public_identifier.clear();
464        self.emitter_state.doctype_public_identifier.extend(value);
465    }
466    fn set_doctype_system_identifier(&mut self, value: &[u8]) {
467        self.emitter_state.doctype_has_system_identifier = true;
468        self.emitter_state.doctype_system_identifier.clear();
469        self.emitter_state.doctype_system_identifier.extend(value);
470    }
471    fn push_doctype_public_identifier(&mut self, value: &[u8]) {
472        self.emitter_state.doctype_public_identifier.extend(value);
473    }
474    fn push_doctype_system_identifier(&mut self, value: &[u8]) {
475        self.emitter_state.doctype_system_identifier.extend(value);
476    }
477
478    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
479        if self.emitter_state.last_start_tag.is_empty() {
480            crate::utils::trace_log!(
481                "current_is_appropriate_end_tag_token: no, because last_start_tag is empty"
482            );
483            return false;
484        }
485
486        if !matches!(self.emitter_state.current_tag_type, Some(CurrentTag::End)) {
487            crate::utils::trace_log!(
488                "current_is_appropriate_end_tag_token: no, because current_tag_type is not end"
489            );
490            return false;
491        }
492
493        crate::utils::trace_log!(
494            "current_is_appropriate_end_tag_token: last_start_tag = {:?}",
495            self.emitter_state.last_start_tag
496        );
497        crate::utils::trace_log!(
498            "current_is_appropriate_end_tag_token: current_tag = {:?}",
499            self.emitter_state.current_tag_name
500        );
501        self.emitter_state.last_start_tag == self.emitter_state.current_tag_name
502    }
503}