html5tokenizer/tokenizer.rs
1pub(crate) mod machine;
2
3use crate::offset::{Offset, Position};
4use crate::reader::{IntoReader, Reader};
5use crate::Emitter;
6use machine::ControlToken;
7
8#[cfg(feature = "integration-tests")]
9pub use machine::State as InternalState;
10
11/// An HTML tokenizer.
12///
13/// # Warning
14///
15/// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`]
16/// results in wrong state transitions:
17///
18/// ```
19/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token};
20/// let emitter = BasicEmitter::default();
21/// let html = "<script><b>";
22/// let mut tokens = Tokenizer::new(html, emitter).flatten();
23/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
24/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
25/// ```
26///
27/// Instead use the [`NaiveParser`] (in the future this crate will also provide a proper implementation of [tree construction]).
28///
29/// [`NaiveParser`]: crate::NaiveParser
30/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
31pub struct Tokenizer<R, O, E> {
32 machine: machine::Machine<R, O, E>,
33 eof: bool,
34}
35
36impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
37 /// Creates a new tokenizer from some input and an emitter.
38 ///
39 /// Note that properly parsing HTML with this tokenizer requires you to
40 /// implement [tree construction] and call [`Tokenizer::set_state`] accordingly.
41 ///
42 /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
43 pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
44 Tokenizer {
45 machine: machine::Machine::new(reader.into_reader(), emitter),
46 eof: false,
47 }
48 }
49
50 /// To be called when the tokenizer iterator implementation yields [`Event::CdataOpen`].
51 ///
52 /// For spec-compliant parsing the supplied boolean must be `true`
53 /// if there is an _adjusted current node_ and it is not an element in
54 /// the HTML namespace, or `false` otherwise (as per the third condition
55 /// under [Markup declaration open state]).
56 ///
57 /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
58 pub fn handle_cdata_open(
59 &mut self,
60 adjusted_current_node_present_and_not_in_html_namespace: bool,
61 ) {
62 machine::handle_cdata_open(
63 &mut self.machine,
64 adjusted_current_node_present_and_not_in_html_namespace,
65 );
66 }
67
68 /// Returns a mutable reference to the emitter.
69 pub fn emitter_mut(&mut self) -> &mut E {
70 &mut self.machine.emitter
71 }
72}
73
/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`].
///
/// Events can be compared with `==` whenever the token type `T` can, which
/// makes it possible to use `assert_eq!` on tokenizer output.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Event<T> {
    /// A token emitted by the [`Emitter`].
    Token(T),
    /// The state machine encountered `<![CDATA[`. You must call [`Tokenizer::handle_cdata_open`],
    /// before advancing the tokenizer iterator again.
    CdataOpen,
}
83
/// The states you can set the tokenizer to.
///
/// The enum is a plain, fieldless value type, so it can be freely copied and
/// compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum State {
    /// The [data state].
    ///
    /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state
    Data,
    /// The [PLAINTEXT state].
    ///
    /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
    Plaintext,
    /// The [RCDATA state].
    ///
    /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    Rcdata,
    /// The [RAWTEXT state].
    ///
    /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
    Rawtext,
    /// The [script data state].
    ///
    /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
    ScriptData,
}
109
110impl From<State> for machine::State {
111 fn from(state: State) -> Self {
112 match state {
113 State::Data => machine::State::Data,
114 State::Plaintext => machine::State::Plaintext,
115 State::Rcdata => machine::State::Rcdata,
116 State::Rawtext => machine::State::Rawtext,
117 State::ScriptData => machine::State::ScriptData,
118 }
119 }
120}
121
122impl<R, O, E> Tokenizer<R, O, E> {
123 /// Test-internal function to override internal state.
124 ///
125 /// Only available with the `integration-tests` feature which is not public API.
126 #[cfg(feature = "integration-tests")]
127 pub fn set_internal_state(&mut self, state: InternalState) {
128 self.machine.state = state;
129 }
130
131 /// Set the statemachine to start/continue in the given state.
132 pub fn set_state(&mut self, state: State) {
133 self.machine.state = state.into();
134 }
135}
136
137impl<O, R, E> Iterator for Tokenizer<R, O, E>
138where
139 O: Offset,
140 R: Reader + Position<O>,
141 E: Emitter<O> + Iterator,
142{
143 type Item = Result<Event<E::Item>, R::Error>;
144
145 fn next(&mut self) -> Option<Self::Item> {
146 loop {
147 if let Some(token) = self.machine.emitter.next() {
148 return Some(Ok(Event::Token(token)));
149 }
150
151 if self.eof {
152 return None;
153 }
154
155 match machine::consume(&mut self.machine) {
156 Err(e) => return Some(Err(e)),
157 Ok(ControlToken::Continue) => (),
158 Ok(ControlToken::Eof) => {
159 self.eof = true;
160 self.machine
161 .emitter
162 .emit_eof(self.machine.reader_position());
163 }
164 Ok(ControlToken::CdataOpen) => return Some(Ok(Event::CdataOpen)),
165 }
166 }
167 }
168}
169
170impl<R, O, E> Tokenizer<R, O, E> {
171 pub(crate) fn enable_naive_state_switching(&mut self) {
172 self.machine.naively_switch_state = true;
173 }
174
175 /// Test-internal function to override internal state.
176 ///
177 /// Only available with the `integration-tests` feature which is not public API.
178 #[cfg(feature = "integration-tests")]
179 pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
180 self.machine.last_start_tag_name.clear();
181 self.machine.last_start_tag_name.push_str(last_start_tag);
182 }
183}