html5tokenizer/
tokenizer.rs

1pub(crate) mod machine;
2
3use crate::offset::{Offset, Position};
4use crate::reader::{IntoReader, Reader};
5use crate::Emitter;
6use machine::ControlToken;
7
8#[cfg(feature = "integration-tests")]
9pub use machine::State as InternalState;
10
/// An HTML tokenizer.
///
/// The tokenizer is driven through its [`Iterator`] implementation, which
/// yields [`Event`]s wrapped in a `Result`.
///
/// # Warning
///
/// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`]
/// results in wrong state transitions. In the following example `<b>` is
/// reported as a start tag although it follows `<script>`:
///
/// ```
/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token};
/// let emitter = BasicEmitter::default();
/// let html = "<script><b>";
/// let mut tokens = Tokenizer::new(html, emitter).flatten();
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// ```
///
/// Instead use the [`NaiveParser`] (in the future this crate will also provide a proper implementation of [tree construction]).
///
/// [`NaiveParser`]: crate::NaiveParser
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R, O, E> {
    /// The underlying state machine, constructed from the reader and the emitter.
    machine: machine::Machine<R, O, E>,
    /// Set to `true` once the state machine has reported the end of the input;
    /// the iterator then stops as soon as the emitter has no tokens left.
    eof: bool,
}
35
36impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
37    /// Creates a new tokenizer from some input and an emitter.
38    ///
39    /// Note that properly parsing HTML with this tokenizer requires you to
40    /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
41    ///
42    /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
43    pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
44        Tokenizer {
45            machine: machine::Machine::new(reader.into_reader(), emitter),
46            eof: false,
47        }
48    }
49
50    /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`].
51    ///
52    /// For spec-compliant parsing the supplied boolean must be `true`
53    /// if there is an _adjusted current node_ and it is not an element in
54    /// the HTML namespace, or `false` otherwise (as per the third condition
55    /// under [Markup declaration open state]).
56    ///
57    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
58    pub fn handle_cdata_open(
59        &mut self,
60        adjusted_current_node_present_and_not_in_html_namespace: bool,
61    ) {
62        machine::handle_cdata_open(
63            &mut self.machine,
64            adjusted_current_node_present_and_not_in_html_namespace,
65        );
66    }
67
68    /// Returns a mutable reference to the emitter.
69    pub fn emitter_mut(&mut self) -> &mut E {
70        &mut self.machine.emitter
71    }
72}
73
/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`].
///
/// Events can be compared with `==` whenever the token type `T` can.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Event<T> {
    /// A token emitted by the [`Emitter`].
    Token(T),
    /// The state machine encountered `<![CDATA[`. You must call
    /// [`Tokenizer::handle_cdata_open`] before advancing the tokenizer
    /// iterator again.
    CdataOpen,
}
83
/// The states you can set the tokenizer to.
///
/// The enum is a plain set of unit variants, so it is cheap to copy and can
/// be compared for equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum State {
    /// The [data state].
    ///
    /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state
    Data,
    /// The [PLAINTEXT state].
    ///
    /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
    Plaintext,
    /// The [RCDATA state].
    ///
    /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    Rcdata,
    /// The [RAWTEXT state].
    ///
    /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
    Rawtext,
    /// The [script data state].
    ///
    /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
    ScriptData,
}
109
110impl From<State> for machine::State {
111    fn from(state: State) -> Self {
112        match state {
113            State::Data => machine::State::Data,
114            State::Plaintext => machine::State::Plaintext,
115            State::Rcdata => machine::State::Rcdata,
116            State::Rawtext => machine::State::Rawtext,
117            State::ScriptData => machine::State::ScriptData,
118        }
119    }
120}
121
122impl<R, O, E> Tokenizer<R, O, E> {
123    /// Test-internal function to override internal state.
124    ///
125    /// Only available with the `integration-tests` feature which is not public API.
126    #[cfg(feature = "integration-tests")]
127    pub fn set_internal_state(&mut self, state: InternalState) {
128        self.machine.state = state;
129    }
130
131    /// Set the statemachine to start/continue in the given state.
132    pub fn set_state(&mut self, state: State) {
133        self.machine.state = state.into();
134    }
135}
136
137impl<O, R, E> Iterator for Tokenizer<R, O, E>
138where
139    O: Offset,
140    R: Reader + Position<O>,
141    E: Emitter<O> + Iterator,
142{
143    type Item = Result<Event<E::Item>, R::Error>;
144
145    fn next(&mut self) -> Option<Self::Item> {
146        loop {
147            if let Some(token) = self.machine.emitter.next() {
148                return Some(Ok(Event::Token(token)));
149            }
150
151            if self.eof {
152                return None;
153            }
154
155            match machine::consume(&mut self.machine) {
156                Err(e) => return Some(Err(e)),
157                Ok(ControlToken::Continue) => (),
158                Ok(ControlToken::Eof) => {
159                    self.eof = true;
160                    self.machine
161                        .emitter
162                        .emit_eof(self.machine.reader_position());
163                }
164                Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
165            }
166        }
167    }
168}
169
170impl<R, O, E> Tokenizer<R, O, E> {
171    pub(crate) fn enable_naive_state_switching(&mut self) {
172        self.machine.naively_switch_state = true;
173    }
174
175    /// Test-internal function to override internal state.
176    ///
177    /// Only available with the `integration-tests` feature which is not public API.
178    #[cfg(feature = "integration-tests")]
179    pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
180        self.machine.last_start_tag_name.clear();
181        self.machine.last_start_tag_name.push_str(last_start_tag);
182    }
183}